Luis Chaves committed on
Commit
70b960d
·
1 Parent(s): 550c7ec

moved things around, cleaned up files, final debug of Dockerfile

Browse files
.DS_Store ADDED
Binary file (8.2 kB). View file
 
Dockerfile CHANGED
@@ -4,6 +4,7 @@ FROM python:3.12-slim
4
  RUN useradd -m appuser
5
  USER appuser
6
  ENV HOME=/home/appuser
 
7
  WORKDIR /code
8
 
9
  COPY --chown=appuser:appuser ./pyproject.toml /code/
 
4
  RUN useradd -m appuser
5
  USER appuser
6
  ENV HOME=/home/appuser
7
+ ENV PATH="${HOME}/.local/bin:${PATH}"
8
  WORKDIR /code
9
 
10
  COPY --chown=appuser:appuser ./pyproject.toml /code/
explore.py DELETED
@@ -1,158 +0,0 @@
1
- import marimo
2
-
3
- __generated_with = "0.10.17"
4
- app = marimo.App(width="medium")
5
-
6
-
7
- @app.cell
8
- def _():
9
- import marimo as mo
10
- import pdfplumber
11
- return mo, pdfplumber
12
-
13
-
14
- @app.cell
15
- def _(mo):
16
- file = mo.ui.file_browser(initial_path="")
17
- file
18
- return (file,)
19
-
20
-
21
- @app.cell
22
- def _(file, pdfplumber):
23
- with pdfplumber.open(file.path()) as pdf:
24
- # Join all pages into single string
25
- pdf_text = " ".join(p.extract_text() for p in pdf.pages)
26
- return pdf, pdf_text
27
-
28
-
29
- @app.cell
30
- def _():
31
- from everycure.extract import extract_pdf_entities
32
- return (extract_pdf_entities,)
33
-
34
-
35
- @app.cell
36
- def _(pdf_text):
37
- from transformers import pipeline
38
- from transformers import AutoTokenizer, AutoModelForTokenClassification
39
-
40
- tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
41
- model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
42
-
43
- pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
44
- result = pipe(pdf_text)
45
- result
46
- return (
47
- AutoModelForTokenClassification,
48
- AutoTokenizer,
49
- model,
50
- pipe,
51
- pipeline,
52
- result,
53
- tokenizer,
54
- )
55
-
56
-
57
- @app.cell
58
- def _():
59
- from gliner import GLiNER
60
-
61
- # Curated medical labels based on your domain needs
62
- MEDICAL_LABELS = [
63
- # Parent: NamedThing -> biological_entity
64
- "gene",
65
- "protein",
66
- "protein_isoform",
67
- "cell",
68
- "disease",
69
- "phenotypic_feature",
70
- "clinical_finding",
71
- "anatomical_entity",
72
- "pathway",
73
- "biological_process",
74
-
75
- # Parent: NamedThing -> chemical_entity
76
- "drug",
77
- "small_molecule",
78
- "food_additive",
79
- "chemical_mixture",
80
- "molecular_entity",
81
-
82
- # Parent: NamedThing -> clinical_entity
83
- "clinical_intervention",
84
- "clinical_trial",
85
- "hospitalization",
86
-
87
- # Parent: NamedThing -> planetary_entity
88
- "geographic_location",
89
- "environmental_feature",
90
- "environmental_process",
91
-
92
- # Parent: NamedThing -> information_content_entity
93
- "publication",
94
- "journal_article",
95
- "book",
96
- "patent",
97
- "dataset",
98
- "study_result",
99
-
100
- # Parent: NamedThing -> organismal_entity
101
- "human",
102
- "mammal",
103
- "plant",
104
- "virus",
105
- "bacterium",
106
- "cell_line",
107
-
108
- # Parent: NamedThing -> attribute
109
- "biological_sex",
110
- "clinical_attribute",
111
- "socioeconomic_attribute",
112
- "environmental_exposure",
113
- "drug_exposure",
114
-
115
- # Parent: NamedThing -> procedure
116
- "procedure",
117
-
118
- # Parent: NamedThing -> treatment
119
- "treatment",
120
-
121
- # Parent: NamedThing -> device
122
- "device",
123
-
124
- # Parent: NamedThing -> diagnostic_aid
125
- "diagnostic_aid",
126
-
127
- # Parent: NamedThing -> event
128
- "event",
129
- ]
130
-
131
- gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
132
-
133
- def gliner_medical_ner(text, threshold=0.7):
134
- entities = gliner_model.predict_entities(text, MEDICAL_LABELS, threshold=threshold)
135
- return [{"text": ent["text"], "label": ent["label"]}
136
- for ent in entities if len(ent["text"]) > 2] # Filter short fragments
137
- return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model
138
-
139
-
140
- @app.cell
141
- def _(gliner_medical_ner, pdf_text):
142
- result_gli = gliner_medical_ner(pdf_text)
143
- return (result_gli,)
144
-
145
-
146
- @app.cell
147
- def _(result_gli):
148
- result_gli
149
- return
150
-
151
-
152
- @app.cell
153
- def _():
154
- return
155
-
156
-
157
- if __name__ == "__main__":
158
- app.run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
j.json DELETED
@@ -1 +0,0 @@
1
- Error: Could not connect to server at http://localhost:7860/api/v1/extract. Make sure the server is running.
 
 
answers.md → mds/answers.md RENAMED
File without changes
learning.md → mds/learning.md RENAMED
File without changes
manual_test_api.py → misc/manual_test_api.py RENAMED
File without changes
tests/test_api.py CHANGED
@@ -26,17 +26,17 @@ def test_extract_entities_invalid_file():
26
  tmp.write(b"Not a PDF file")
27
  tmp.seek(0)
28
  response = client.post(
29
- "/extract",
30
  files={"file": ("test.txt", tmp, "text/plain")}
31
  )
32
 
33
  assert response.status_code == 415
34
- assert "Invalid file type" in response.json()["detail"]
35
 
36
  def test_extract_entities_empty_file(test_pdf):
37
  with open(test_pdf, "rb") as f:
38
  response = client.post(
39
- "/extract",
40
  files={} # No file provided
41
  )
42
 
 
26
  tmp.write(b"Not a PDF file")
27
  tmp.seek(0)
28
  response = client.post(
29
+ "/api/v1/extract",
30
  files={"file": ("test.txt", tmp, "text/plain")}
31
  )
32
 
33
  assert response.status_code == 415
34
+ assert "Unsupported file type." in response.json()["detail"]
35
 
36
  def test_extract_entities_empty_file(test_pdf):
37
  with open(test_pdf, "rb") as f:
38
  response = client.post(
39
+ "/api/v1/extract",
40
  files={} # No file provided
41
  )
42