Spaces:

lucharo
/

everycure-ner-pdf

Sleeping

App Files Files Community

Luis Chaves committed 10 days ago

Commit

70b960d

1 Parent(s): 550c7ec

moved things around, cleaned up files, final debug of Dockerfile

Browse files

Files changed (8) hide show

.DS_Store +0 -0
Dockerfile +1 -0
explore.py +0 -158
j.json +0 -1
answers.md → mds/answers.md +0 -0
learning.md → mds/learning.md +0 -0
manual_test_api.py → misc/manual_test_api.py +0 -0
tests/test_api.py +3 -3

.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

Dockerfile CHANGED Viewed

@@ -4,6 +4,7 @@ FROM python:3.12-slim
 RUN useradd -m appuser
 USER appuser
 ENV HOME=/home/appuser
 WORKDIR /code
 COPY --chown=appuser:appuser ./pyproject.toml /code/

 RUN useradd -m appuser
 USER appuser
 ENV HOME=/home/appuser
+ENV PATH="${HOME}/.local/bin:${PATH}"
 WORKDIR /code
 COPY --chown=appuser:appuser ./pyproject.toml /code/

explore.py DELETED Viewed

@@ -1,158 +0,0 @@
-import marimo
-__generated_with = "0.10.17"
-app = marimo.App(width="medium")
-@app.cell
-def _():
-    import marimo as mo
-    import pdfplumber
-    return mo, pdfplumber
-@app.cell
-def _(mo):
-    file = mo.ui.file_browser(initial_path="")
-    file
-    return (file,)
-@app.cell
-def _(file, pdfplumber):
-    with pdfplumber.open(file.path()) as pdf:
-        # Join all pages into single string
-        pdf_text = " ".join(p.extract_text() for p in pdf.pages)
-    return pdf, pdf_text
-@app.cell
-def _():
-    from everycure.extract import extract_pdf_entities
-    return (extract_pdf_entities,)
-@app.cell
-def _(pdf_text):
-    from transformers import pipeline
-    from transformers import AutoTokenizer, AutoModelForTokenClassification
-    tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
-    model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
-    pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
-    result = pipe(pdf_text)
-    result
-    return (
-        AutoModelForTokenClassification,
-        AutoTokenizer,
-        model,
-        pipe,
-        pipeline,
-        result,
-        tokenizer,
-    )
-@app.cell
-def _():
-    from gliner import GLiNER
-    # Curated medical labels based on your domain needs
-    MEDICAL_LABELS = [
-        # Parent: NamedThing -> biological_entity
-        "gene",
-        "protein",
-        "protein_isoform",
-        "cell",
-        "disease",
-        "phenotypic_feature",
-        "clinical_finding",
-        "anatomical_entity",
-        "pathway",
-        "biological_process",
-        # Parent: NamedThing -> chemical_entity
-        "drug",
-        "small_molecule",
-        "food_additive",
-        "chemical_mixture",
-        "molecular_entity",
-        # Parent: NamedThing -> clinical_entity
-        "clinical_intervention",
-        "clinical_trial",
-        "hospitalization",
-        # Parent: NamedThing -> planetary_entity
-        "geographic_location",
-        "environmental_feature",
-        "environmental_process",
-        # Parent: NamedThing -> information_content_entity
-        "publication",
-        "journal_article",
-        "book",
-        "patent",
-        "dataset",
-        "study_result",
-        # Parent: NamedThing -> organismal_entity
-        "human",
-        "mammal",
-        "plant",
-        "virus",
-        "bacterium",
-        "cell_line",
-        # Parent: NamedThing -> attribute
-        "biological_sex",
-        "clinical_attribute",
-        "socioeconomic_attribute",
-        "environmental_exposure",
-        "drug_exposure",
-        # Parent: NamedThing -> procedure
-        "procedure",
-        # Parent: NamedThing -> treatment
-        "treatment",
-        # Parent: NamedThing -> device
-        "device",
-        # Parent: NamedThing -> diagnostic_aid
-        "diagnostic_aid",
-        # Parent: NamedThing -> event
-        "event",
-    ]
-    gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
-    def gliner_medical_ner(text, threshold=0.7):
-        entities = gliner_model.predict_entities(text, MEDICAL_LABELS, threshold=threshold)
-        return [{"text": ent["text"], "label": ent["label"]}
-                for ent in entities if len(ent["text"]) > 2]  # Filter short fragments
-    return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model
-@app.cell
-def _(gliner_medical_ner, pdf_text):
-    result_gli = gliner_medical_ner(pdf_text)
-    return (result_gli,)
-@app.cell
-def _(result_gli):
-    result_gli
-    return
-@app.cell
-def _():
-    return
-if __name__ == "__main__":
-    app.run()

j.json DELETED Viewed

@@ -1 +0,0 @@
-Error: Could not connect to server at http://localhost:7860/api/v1/extract. Make sure the server is running.

answers.md → mds/answers.md RENAMED Viewed

File without changes

learning.md → mds/learning.md RENAMED Viewed

File without changes

manual_test_api.py → misc/manual_test_api.py RENAMED Viewed

File without changes

tests/test_api.py CHANGED Viewed

@@ -26,17 +26,17 @@ def test_extract_entities_invalid_file():
         tmp.write(b"Not a PDF file")
         tmp.seek(0)
         response = client.post(
-            "/extract",
             files={"file": ("test.txt", tmp, "text/plain")}
         )
     assert response.status_code == 415
-    assert "Invalid file type" in response.json()["detail"]
 def test_extract_entities_empty_file(test_pdf):
     with open(test_pdf, "rb") as f:
         response = client.post(
-            "/extract",
             files={}  # No file provided
         )

         tmp.write(b"Not a PDF file")
         tmp.seek(0)
         response = client.post(
+            "/api/v1/extract",
             files={"file": ("test.txt", tmp, "text/plain")}
         )
     assert response.status_code == 415
+    assert "Unsupported file type." in response.json()["detail"]
 def test_extract_entities_empty_file(test_pdf):
     with open(test_pdf, "rb") as f:
         response = client.post(
+            "/api/v1/extract",
             files={}  # No file provided
         )