Spaces:
Sleeping
Sleeping
Luis Chaves
committed on
Commit
·
70b960d
1
Parent(s):
550c7ec
moved things around, cleaned up files, final debug of Dockerfile
Browse files
- .DS_Store +0 -0
- Dockerfile +1 -0
- explore.py +0 -158
- j.json +0 -1
- answers.md → mds/answers.md +0 -0
- learning.md → mds/learning.md +0 -0
- manual_test_api.py → misc/manual_test_api.py +0 -0
- tests/test_api.py +3 -3
.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
Dockerfile
CHANGED
@@ -4,6 +4,7 @@ FROM python:3.12-slim
|
|
4 |
RUN useradd -m appuser
|
5 |
USER appuser
|
6 |
ENV HOME=/home/appuser
|
|
|
7 |
WORKDIR /code
|
8 |
|
9 |
COPY --chown=appuser:appuser ./pyproject.toml /code/
|
|
|
4 |
RUN useradd -m appuser
|
5 |
USER appuser
|
6 |
ENV HOME=/home/appuser
|
7 |
+
ENV PATH="${HOME}/.local/bin:${PATH}"
|
8 |
WORKDIR /code
|
9 |
|
10 |
COPY --chown=appuser:appuser ./pyproject.toml /code/
|
explore.py
DELETED
@@ -1,158 +0,0 @@
|
|
1 |
-
import marimo
|
2 |
-
|
3 |
-
__generated_with = "0.10.17"
|
4 |
-
app = marimo.App(width="medium")
|
5 |
-
|
6 |
-
|
7 |
-
@app.cell
|
8 |
-
def _():
|
9 |
-
import marimo as mo
|
10 |
-
import pdfplumber
|
11 |
-
return mo, pdfplumber
|
12 |
-
|
13 |
-
|
14 |
-
@app.cell
|
15 |
-
def _(mo):
|
16 |
-
file = mo.ui.file_browser(initial_path="")
|
17 |
-
file
|
18 |
-
return (file,)
|
19 |
-
|
20 |
-
|
21 |
-
@app.cell
|
22 |
-
def _(file, pdfplumber):
|
23 |
-
with pdfplumber.open(file.path()) as pdf:
|
24 |
-
# Join all pages into single string
|
25 |
-
pdf_text = " ".join(p.extract_text() for p in pdf.pages)
|
26 |
-
return pdf, pdf_text
|
27 |
-
|
28 |
-
|
29 |
-
@app.cell
|
30 |
-
def _():
|
31 |
-
from everycure.extract import extract_pdf_entities
|
32 |
-
return (extract_pdf_entities,)
|
33 |
-
|
34 |
-
|
35 |
-
@app.cell
|
36 |
-
def _(pdf_text):
|
37 |
-
from transformers import pipeline
|
38 |
-
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
39 |
-
|
40 |
-
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
|
41 |
-
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
|
42 |
-
|
43 |
-
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
|
44 |
-
result = pipe(pdf_text)
|
45 |
-
result
|
46 |
-
return (
|
47 |
-
AutoModelForTokenClassification,
|
48 |
-
AutoTokenizer,
|
49 |
-
model,
|
50 |
-
pipe,
|
51 |
-
pipeline,
|
52 |
-
result,
|
53 |
-
tokenizer,
|
54 |
-
)
|
55 |
-
|
56 |
-
|
57 |
-
@app.cell
|
58 |
-
def _():
|
59 |
-
from gliner import GLiNER
|
60 |
-
|
61 |
-
# Curated medical labels based on your domain needs
|
62 |
-
MEDICAL_LABELS = [
|
63 |
-
# Parent: NamedThing -> biological_entity
|
64 |
-
"gene",
|
65 |
-
"protein",
|
66 |
-
"protein_isoform",
|
67 |
-
"cell",
|
68 |
-
"disease",
|
69 |
-
"phenotypic_feature",
|
70 |
-
"clinical_finding",
|
71 |
-
"anatomical_entity",
|
72 |
-
"pathway",
|
73 |
-
"biological_process",
|
74 |
-
|
75 |
-
# Parent: NamedThing -> chemical_entity
|
76 |
-
"drug",
|
77 |
-
"small_molecule",
|
78 |
-
"food_additive",
|
79 |
-
"chemical_mixture",
|
80 |
-
"molecular_entity",
|
81 |
-
|
82 |
-
# Parent: NamedThing -> clinical_entity
|
83 |
-
"clinical_intervention",
|
84 |
-
"clinical_trial",
|
85 |
-
"hospitalization",
|
86 |
-
|
87 |
-
# Parent: NamedThing -> planetary_entity
|
88 |
-
"geographic_location",
|
89 |
-
"environmental_feature",
|
90 |
-
"environmental_process",
|
91 |
-
|
92 |
-
# Parent: NamedThing -> information_content_entity
|
93 |
-
"publication",
|
94 |
-
"journal_article",
|
95 |
-
"book",
|
96 |
-
"patent",
|
97 |
-
"dataset",
|
98 |
-
"study_result",
|
99 |
-
|
100 |
-
# Parent: NamedThing -> organismal_entity
|
101 |
-
"human",
|
102 |
-
"mammal",
|
103 |
-
"plant",
|
104 |
-
"virus",
|
105 |
-
"bacterium",
|
106 |
-
"cell_line",
|
107 |
-
|
108 |
-
# Parent: NamedThing -> attribute
|
109 |
-
"biological_sex",
|
110 |
-
"clinical_attribute",
|
111 |
-
"socioeconomic_attribute",
|
112 |
-
"environmental_exposure",
|
113 |
-
"drug_exposure",
|
114 |
-
|
115 |
-
# Parent: NamedThing -> procedure
|
116 |
-
"procedure",
|
117 |
-
|
118 |
-
# Parent: NamedThing -> treatment
|
119 |
-
"treatment",
|
120 |
-
|
121 |
-
# Parent: NamedThing -> device
|
122 |
-
"device",
|
123 |
-
|
124 |
-
# Parent: NamedThing -> diagnostic_aid
|
125 |
-
"diagnostic_aid",
|
126 |
-
|
127 |
-
# Parent: NamedThing -> event
|
128 |
-
"event",
|
129 |
-
]
|
130 |
-
|
131 |
-
gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
|
132 |
-
|
133 |
-
def gliner_medical_ner(text, threshold=0.7):
|
134 |
-
entities = gliner_model.predict_entities(text, MEDICAL_LABELS, threshold=threshold)
|
135 |
-
return [{"text": ent["text"], "label": ent["label"]}
|
136 |
-
for ent in entities if len(ent["text"]) > 2] # Filter short fragments
|
137 |
-
return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model
|
138 |
-
|
139 |
-
|
140 |
-
@app.cell
|
141 |
-
def _(gliner_medical_ner, pdf_text):
|
142 |
-
result_gli = gliner_medical_ner(pdf_text)
|
143 |
-
return (result_gli,)
|
144 |
-
|
145 |
-
|
146 |
-
@app.cell
|
147 |
-
def _(result_gli):
|
148 |
-
result_gli
|
149 |
-
return
|
150 |
-
|
151 |
-
|
152 |
-
@app.cell
|
153 |
-
def _():
|
154 |
-
return
|
155 |
-
|
156 |
-
|
157 |
-
if __name__ == "__main__":
|
158 |
-
app.run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
j.json
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
Error: Could not connect to server at http://localhost:7860/api/v1/extract. Make sure the server is running.
|
|
|
|
answers.md → mds/answers.md
RENAMED
File without changes
|
learning.md → mds/learning.md
RENAMED
File without changes
|
manual_test_api.py → misc/manual_test_api.py
RENAMED
File without changes
|
tests/test_api.py
CHANGED
@@ -26,17 +26,17 @@ def test_extract_entities_invalid_file():
|
|
26 |
tmp.write(b"Not a PDF file")
|
27 |
tmp.seek(0)
|
28 |
response = client.post(
|
29 |
-
"/extract",
|
30 |
files={"file": ("test.txt", tmp, "text/plain")}
|
31 |
)
|
32 |
|
33 |
assert response.status_code == 415
|
34 |
-
assert "
|
35 |
|
36 |
def test_extract_entities_empty_file(test_pdf):
|
37 |
with open(test_pdf, "rb") as f:
|
38 |
response = client.post(
|
39 |
-
"/extract",
|
40 |
files={} # No file provided
|
41 |
)
|
42 |
|
|
|
26 |
tmp.write(b"Not a PDF file")
|
27 |
tmp.seek(0)
|
28 |
response = client.post(
|
29 |
+
"/api/v1/extract",
|
30 |
files={"file": ("test.txt", tmp, "text/plain")}
|
31 |
)
|
32 |
|
33 |
assert response.status_code == 415
|
34 |
+
assert "Unsupported file type." in response.json()["detail"]
|
35 |
|
36 |
def test_extract_entities_empty_file(test_pdf):
|
37 |
with open(test_pdf, "rb") as f:
|
38 |
response = client.post(
|
39 |
+
"/api/v1/extract",
|
40 |
files={} # No file provided
|
41 |
)
|
42 |
|