import arxiv import gradio as gr import pandas as pd from apscheduler.schedulers.background import BackgroundScheduler from cachetools import TTLCache, cached from setfit import SetFitModel from tqdm.auto import tqdm CACHE_TIME = 60 * 60 * 12 MAX_RESULTS = 30_000 @cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME)) def get_arxiv_result(): search = arxiv.Search( query="ti:dataset AND abs:machine learning", max_results=MAX_RESULTS, sort_by=arxiv.SortCriterion.SubmittedDate, ) return [ { "title": result.title, "abstract": result.summary, "url": result.entry_id, "category": result.primary_category, "updated": result.updated, } for result in tqdm(search.results(), total=MAX_RESULTS) ] def load_model(): return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model") def format_row_for_model(row): return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}" int2label = {0: "new_dataset", 1: "not_new_dataset"} def get_predictions(data: list[dict], model=None, batch_size=32): if model is None: model = load_model() predictions = [] for i in tqdm(range(0, len(data), batch_size)): batch = data[i : i + batch_size] text_inputs = [format_row_for_model(row) for row in batch] batch_predictions = model.predict_proba(text_inputs) for j, row in enumerate(batch): prediction = batch_predictions[j] row["prediction"] = int2label[int(prediction.argmax())] row["probability"] = float(prediction.max()) predictions.append(row) return predictions def create_markdown(row): title = row["title"] abstract = row["abstract"] arxiv_id = row["arxiv_id"] hub_paper_url = f"https://huggingface.co/papers/{arxiv_id}" updated = row["updated"] updated = updated.strftime("%Y-%m-%d") broad_category = row["broad_category"] category = row["category"] return f"""