Spaces:

lucharo
/

everycure-ner-pdf

Sleeping

Luis Chaves

first commit

73b49a2 11 days ago

3.66 kB

	import marimo

	# Version marker stored in the file; presumably the marimo release that generated it — confirm.
	__generated_with = "0.10.17"
	# Application object; every `@app.cell` function below is registered on it.
	app = marimo.App(width="medium")


	@app.cell
	def _():
	    # Modules shared with the other cells: marimo (as `mo`) for UI widgets
	    # and pdfplumber for reading PDFs.
	    import pdfplumber
	    import marimo as mo

	    return (mo, pdfplumber)


	@app.cell
	def _(mo):
	    # File picker widget; an empty `initial_path` is passed through unchanged.
	    file = mo.ui.file_browser(
	        initial_path="",
	    )
	    # Bare expression as the last statement before `return`: the widget is
	    # what this cell displays.
	    file
	    return (file,)


	@app.cell
	def _(file, pdfplumber):
	    # Extract the text of the PDF chosen in the file browser.
	    #
	    # `file.path()` gives None while nothing is selected (per the marimo
	    # file_browser docs); fail with a readable message instead of handing
	    # None to pdfplumber. The helper name is underscore-prefixed so it
	    # stays local to this cell.
	    _selected_path = file.path()
	    if _selected_path is None:
	        raise ValueError("No PDF selected: pick a file in the file browser first.")

	    with pdfplumber.open(_selected_path) as pdf:
	        # Join all pages into a single string. `or ""` tolerates a page
	        # whose extract_text() yields None (no text on that page) instead
	        # of letting str.join raise TypeError.
	        pdf_text = " ".join(p.extract_text() or "" for p in pdf.pages)
	    # NOTE: `pdf` has already been closed by the `with` block when it is
	    # returned; only `pdf_text` is usable downstream.
	    return pdf, pdf_text


	@app.cell
	def _():
	    # Expose the project's own PDF entity extractor to the rest of the
	    # notebook.
	    from everycure.extract import extract_pdf_entities

	    return (extract_pdf_entities,)


	@app.cell
	def _(pdf_text):
	    # Run a Hugging Face token-classification (NER) pipeline over the
	    # extracted PDF text and display the aggregated entities.
	    from transformers import pipeline
	    from transformers import AutoTokenizer, AutoModelForTokenClassification

	    tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
	    model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

	    # A whole PDF is usually longer than the model's input window. Cap the
	    # tokenizer's window at the model's positional limit so that the
	    # chunking enabled by `stride` below produces chunks the model can
	    # actually consume.
	    # NOTE(review): assumes the config exposes `max_position_embeddings`
	    # (BERT/DistilBERT-style configs do); otherwise the tokenizer's own
	    # limit is kept.
	    tokenizer.model_max_length = min(
	        tokenizer.model_max_length,
	        getattr(model.config, "max_position_embeddings", tokenizer.model_max_length),
	    )

	    # With `stride` set (fast tokenizer + an aggregation strategy), the
	    # pipeline splits the text into overlapping chunks and processes all
	    # of it instead of only the first window. The value is the number of
	    # tokens shared by neighbouring chunks.
	    pipe = pipeline(
	        "ner",
	        model=model,
	        tokenizer=tokenizer,
	        aggregation_strategy="simple",
	        stride=64,
	    )  # pass device=0 if using gpu
	    result = pipe(pdf_text)
	    # Last expression before `return`: shown as this cell's output.
	    result
	    return (
	        AutoModelForTokenClassification,
	        AutoTokenizer,
	        model,
	        pipe,
	        pipeline,
	        result,
	        tokenizer,
	    )


	@app.cell
	def _():
	    from gliner import GLiNER

	    # Entity labels handed to GLiNER, grouped by the parent category each
	    # one is filed under.
	    MEDICAL_LABELS = [
	        # NamedThing -> biological_entity
	        "gene",
	        "protein",
	        "protein_isoform",
	        "cell",
	        "disease",
	        "phenotypic_feature",
	        "clinical_finding",
	        "anatomical_entity",
	        "pathway",
	        "biological_process",

	        # NamedThing -> chemical_entity
	        "drug",
	        "small_molecule",
	        "food_additive",
	        "chemical_mixture",
	        "molecular_entity",

	        # NamedThing -> clinical_entity
	        "clinical_intervention",
	        "clinical_trial",
	        "hospitalization",

	        # NamedThing -> planetary_entity
	        "geographic_location",
	        "environmental_feature",
	        "environmental_process",

	        # NamedThing -> information_content_entity
	        "publication",
	        "journal_article",
	        "book",
	        "patent",
	        "dataset",
	        "study_result",

	        # NamedThing -> organismal_entity
	        "human",
	        "mammal",
	        "plant",
	        "virus",
	        "bacterium",
	        "cell_line",

	        # NamedThing -> attribute
	        "biological_sex",
	        "clinical_attribute",
	        "socioeconomic_attribute",
	        "environmental_exposure",
	        "drug_exposure",

	        # NamedThing -> procedure
	        "procedure",

	        # NamedThing -> treatment
	        "treatment",

	        # NamedThing -> device
	        "device",

	        # NamedThing -> diagnostic_aid
	        "diagnostic_aid",

	        # NamedThing -> event
	        "event",
	    ]

	    gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")

	    def gliner_medical_ner(text, threshold=0.7):
	        """Tag `text` with GLiNER using MEDICAL_LABELS.

	        `threshold` is forwarded unchanged to `predict_entities`. Returns a
	        list of ``{"text": ..., "label": ...}`` dicts, one per predicted
	        entity whose text is longer than two characters.
	        """
	        kept = []
	        for span in gliner_model.predict_entities(text, MEDICAL_LABELS, threshold=threshold):
	            # Drop very short fragments (two characters or fewer).
	            if len(span["text"]) <= 2:
	                continue
	            kept.append({"text": span["text"], "label": span["label"]})
	        return kept

	    return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model


	@app.cell
	def _(gliner_medical_ner, pdf_text):
	    # GLiNER entities for the extracted PDF text, using the helper's
	    # default threshold.
	    result_gli = gliner_medical_ner(text=pdf_text)
	    return (result_gli,)


	@app.cell
	def _(result_gli):
	    # Display-only cell: the bare expression is the cell's output and
	    # nothing is defined for other cells.
	    result_gli
	    return


	@app.cell
	def _():
	    # Empty cell: no code, no inputs, no definitions.
	    return


	# Run the notebook as an app when this file is executed as a script.
	if __name__ == "__main__":
	    app.run()