# Spaces:
# Sleeping
# Sleeping
# Marimo notebook: pick a PDF, extract its text, and run NER models over it.
import marimo

# Version stamp written into the notebook file.
__generated_with = "0.10.17"
# Application object for this notebook, created with the "medium" width setting.
app = marimo.App(width="medium")
def _():
    # Import the shared dependencies and return them under the names
    # `mo` (marimo) and `pdfplumber`.
    import marimo as mo
    import pdfplumber

    return mo, pdfplumber
def _(mo):
    # Build the file-browser widget used to choose the input file;
    # `initial_path=""` is passed through unchanged.
    file = mo.ui.file_browser(initial_path="")
    # Bare expression kept as the last expression before the return
    # (NOTE(review): presumably what marimo renders as the cell output).
    file
    return (file,)
def _(file, pdfplumber):
    # Open the file chosen in the browser and flatten its pages into one string.
    # Returns the (already exited) pdf object and the joined text.
    _selected_path = file.path()
    if _selected_path is None:
        # Nothing selected: fail with a clear message instead of handing
        # None to pdfplumber.open().
        raise ValueError("No PDF selected in the file browser.")
    with pdfplumber.open(_selected_path) as pdf:
        # Join all pages into a single string. extract_text() can return
        # None for a page without text in some pdfplumber versions
        # (NOTE(review): confirm for the installed version); str.join
        # rejects None, so fall back to "".
        pdf_text = " ".join((p.extract_text() or "") for p in pdf.pages)
    # `pdf` is returned after the with-block has exited.
    return pdf, pdf_text
def _():
    # Import `extract_pdf_entities` from `everycure.extract` and return it
    # as this cell's only name.
    from everycure.extract import extract_pdf_entities

    return (extract_pdf_entities,)
def _(pdf_text):
    # Tag the extracted PDF text with the "d4data/biomedical-ner-all"
    # token-classification model and show the aggregated entities.
    from transformers import (
        AutoModelForTokenClassification,
        AutoTokenizer,
        pipeline,
    )

    tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
    model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
    # The text is a whole document passed in one call. Without `stride` the
    # pipeline only processes what fits in one model window; with `stride`
    # it splits the input into overlapping chunks and tags all of it.
    # NOTE(review): 128 tokens of overlap is a starting value and must stay
    # below tokenizer.model_max_length — confirm for this checkpoint.
    pipe = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        stride=128,
    )  # pass device=0 if using gpu
    result = pipe(pdf_text)
    # Bare expression kept as the last expression before the return.
    result
    return (
        AutoModelForTokenClassification,
        AutoTokenizer,
        model,
        pipe,
        pipeline,
        result,
        tokenizer,
    )
def _():
    # Load the GLiNER model and define `gliner_medical_ner`, a helper that
    # extracts entities for a fixed list of labels.
    from gliner import GLiNER

    # Curated entity labels passed to the model, grouped by parent category.
    MEDICAL_LABELS = [
        # Parent: NamedThing -> biological_entity
        "gene",
        "protein",
        "protein_isoform",
        "cell",
        "disease",
        "phenotypic_feature",
        "clinical_finding",
        "anatomical_entity",
        "pathway",
        "biological_process",
        # Parent: NamedThing -> chemical_entity
        "drug",
        "small_molecule",
        "food_additive",
        "chemical_mixture",
        "molecular_entity",
        # Parent: NamedThing -> clinical_entity
        "clinical_intervention",
        "clinical_trial",
        "hospitalization",
        # Parent: NamedThing -> planetary_entity
        "geographic_location",
        "environmental_feature",
        "environmental_process",
        # Parent: NamedThing -> information_content_entity
        "publication",
        "journal_article",
        "book",
        "patent",
        "dataset",
        "study_result",
        # Parent: NamedThing -> organismal_entity
        "human",
        "mammal",
        "plant",
        "virus",
        "bacterium",
        "cell_line",
        # Parent: NamedThing -> attribute
        "biological_sex",
        "clinical_attribute",
        "socioeconomic_attribute",
        "environmental_exposure",
        "drug_exposure",
        # Parent: NamedThing -> procedure
        "procedure",
        # Parent: NamedThing -> treatment
        "treatment",
        # Parent: NamedThing -> device
        "device",
        # Parent: NamedThing -> diagnostic_aid
        "diagnostic_aid",
        # Parent: NamedThing -> event
        "event",
    ]

    gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")

    def gliner_medical_ner(text, threshold=0.7):
        """Return entities found in *text* as ``{"text", "label"}`` dicts.

        Calls ``gliner_model.predict_entities`` with ``MEDICAL_LABELS`` and
        the given ``threshold``, then drops any entity whose text is two
        characters or shorter.
        """
        # NOTE(review): the input is passed in a single call; confirm how the
        # model handles texts longer than its maximum input length.
        entities = gliner_model.predict_entities(
            text, MEDICAL_LABELS, threshold=threshold
        )
        return [
            {"text": ent["text"], "label": ent["label"]}
            for ent in entities
            if len(ent["text"]) > 2  # Filter short fragments
        ]

    return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model
def _(gliner_medical_ner, pdf_text):
    # Run the entity extractor on the PDF text, leaving its threshold at the
    # extractor's default, and return the result as `result_gli`.
    result_gli = gliner_medical_ner(pdf_text)
    return (result_gli,)
def _(result_gli):
    # Bare expression only: the cell evaluates `result_gli` and defines
    # nothing (NOTE(review): presumably rendered by marimo as the output).
    result_gli
    return
def _():
    # Empty cell: defines nothing and returns None.
    return
if __name__ == "__main__":
    # Run the notebook app when this file is executed as a script.
    # NOTE(review): no `@app.cell` decorators are visible on the `_`
    # functions in this view — confirm the cells are registered on `app`.
    app.run()