everycure-ner-pdf / explore.py
Luis Chaves
first commit
73b49a2
raw
history blame
3.66 kB
import marimo
# Version string recorded in the file; presumably the marimo release that
# generated this notebook — confirm before editing by hand.
__generated_with = "0.10.17"
# Notebook application object; every `@app.cell` decorator in this file hangs
# off it, and `app.run()` is called in the `__main__` guard.
app = marimo.App(width="medium")
@app.cell
def _():
    # Import the notebook UI module (as `mo`) and the PDF reader; both are
    # returned so that other cells can take them as parameters by name.
    import marimo as mo
    import pdfplumber
    return mo, pdfplumber
@app.cell
def _(mo):
    # File-browser widget used to pick the PDF; `initial_path` is passed as an
    # empty string.
    file = mo.ui.file_browser(initial_path="")
    # Bare expression — presumably rendered by marimo as this cell's output.
    file
    return (file,)
@app.cell
def _(file, pdfplumber):
    # Read the PDF chosen in the file browser and flatten it to one string.
    # NOTE(review): file.path() is handed straight to pdfplumber.open(); what
    # it yields when nothing is selected is not visible here — confirm, and
    # guard upstream if it can be None.
    with pdfplumber.open(file.path()) as pdf:
        # Join all pages into a single space-separated string. A page with no
        # extractable text may come back as None (reported for older
        # pdfplumber releases — confirm against the installed version), which
        # would make str.join raise TypeError, so such pages count as "".
        pdf_text = " ".join((p.extract_text() or "") for p in pdf.pages)
    # `pdf` is returned as well; the with-block has already exited by now.
    return pdf, pdf_text
@app.cell
def _():
    # Import the project's PDF entity extractor and return it so other cells
    # can request it by name.
    from everycure.extract import extract_pdf_entities
    return (extract_pdf_entities,)
@app.cell
def _(pdf_text):
    # Named-entity recognition over the extracted PDF text with a Hugging Face
    # token-classification pipeline built on "d4data/biomedical-ner-all".
    from transformers import (
        AutoModelForTokenClassification,
        AutoTokenizer,
        pipeline,
    )

    # Leading underscore keeps this helper name out of the cell's returned
    # definitions; tokenizer and model are loaded from the same checkpoint.
    _checkpoint = "d4data/biomedical-ner-all"
    tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(_checkpoint)

    # pass device=0 if using gpu
    pipe = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )

    # NOTE(review): the whole document is sent through in a single call; how
    # the pipeline treats input longer than the model's limit is not visible
    # here — confirm (see the pipeline's `stride` option).
    result = pipe(pdf_text)
    # Bare expression — presumably rendered by marimo as this cell's output.
    result
    return (
        AutoModelForTokenClassification,
        AutoTokenizer,
        model,
        pipe,
        pipeline,
        result,
        tokenizer,
    )
@app.cell
def _():
    # GLiNER-based entity extraction: defines the label vocabulary, loads the
    # model once, and exposes a small helper that other cells call.
    from gliner import GLiNER

    # Label vocabulary handed to the model, grouped by parent category.
    MEDICAL_LABELS = [
        # Parent: NamedThing -> biological_entity
        "gene",
        "protein",
        "protein_isoform",
        "cell",
        "disease",
        "phenotypic_feature",
        "clinical_finding",
        "anatomical_entity",
        "pathway",
        "biological_process",
        # Parent: NamedThing -> chemical_entity
        "drug",
        "small_molecule",
        "food_additive",
        "chemical_mixture",
        "molecular_entity",
        # Parent: NamedThing -> clinical_entity
        "clinical_intervention",
        "clinical_trial",
        "hospitalization",
        # Parent: NamedThing -> planetary_entity
        "geographic_location",
        "environmental_feature",
        "environmental_process",
        # Parent: NamedThing -> information_content_entity
        "publication",
        "journal_article",
        "book",
        "patent",
        "dataset",
        "study_result",
        # Parent: NamedThing -> organismal_entity
        "human",
        "mammal",
        "plant",
        "virus",
        "bacterium",
        "cell_line",
        # Parent: NamedThing -> attribute
        "biological_sex",
        "clinical_attribute",
        "socioeconomic_attribute",
        "environmental_exposure",
        "drug_exposure",
        # Parent: NamedThing -> procedure
        "procedure",
        # Parent: NamedThing -> treatment
        "treatment",
        # Parent: NamedThing -> device
        "device",
        # Parent: NamedThing -> diagnostic_aid
        "diagnostic_aid",
        # Parent: NamedThing -> event
        "event",
    ]

    gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")

    def gliner_medical_ner(text, threshold=0.7):
        # Ask the model for entities in `text` restricted to MEDICAL_LABELS at
        # the given threshold, then reduce each hit to its text and label.
        # NOTE(review): `text` is passed whole; how the model handles very
        # long input is not visible here — confirm.
        predictions = gliner_model.predict_entities(
            text, MEDICAL_LABELS, threshold=threshold
        )
        kept = []
        for prediction in predictions:
            # Filter short fragments: spans of two characters or fewer.
            if len(prediction["text"]) <= 2:
                continue
            kept.append({"text": prediction["text"], "label": prediction["label"]})
        return kept

    return (GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model)
@app.cell
def _(gliner_medical_ner, pdf_text):
    # Run the GLiNER helper over the full PDF text with its default threshold.
    result_gli = gliner_medical_ner(pdf_text)
    return (result_gli,)
@app.cell
def _(result_gli):
    # Bare expression showing the GLiNER results — presumably rendered by
    # marimo as this cell's output.
    result_gli
    return
@app.cell
def _():
    # Empty cell: contains nothing besides the bare return.
    return
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()