File size: 2,842 Bytes
672f445 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
"""
Script for evaluating Jina Embedding Models on the MTEB benchmark.
This script is based on the MTEB example:
https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py
"""
import logging
from mteb import MTEB
from sentence_transformers import SentenceTransformer
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")
TASK_LIST_CLASSIFICATION = [
"AmazonCounterfactualClassification",
"AmazonPolarityClassification",
"AmazonReviewsClassification",
"Banking77Classification",
"EmotionClassification",
"ImdbClassification",
"MassiveIntentClassification",
"MassiveScenarioClassification",
"MTOPDomainClassification",
"MTOPIntentClassification",
"ToxicConversationsClassification",
"TweetSentimentExtractionClassification",
]
TASK_LIST_CLUSTERING = [
"ArxivClusteringP2P",
"ArxivClusteringS2S",
"BiorxivClusteringP2P",
"BiorxivClusteringS2S",
"MedrxivClusteringP2P",
"MedrxivClusteringS2S",
"RedditClustering",
"RedditClusteringP2P",
"StackExchangeClustering",
"StackExchangeClusteringP2P",
"TwentyNewsgroupsClustering",
]
TASK_LIST_PAIR_CLASSIFICATION = [
"SprintDuplicateQuestions",
"TwitterSemEval2015",
"TwitterURLCorpus",
]
TASK_LIST_RERANKING = [
"AskUbuntuDupQuestions",
"MindSmallReranking",
"SciDocsRR",
"StackOverflowDupQuestions",
]
TASK_LIST_RETRIEVAL = [
"ArguAna",
"ClimateFEVER",
"CQADupstackAndroidRetrieval",
"CQADupstackEnglishRetrieval",
"CQADupstackGamingRetrieval",
"CQADupstackGisRetrieval",
"CQADupstackMathematicaRetrieval",
"CQADupstackPhysicsRetrieval",
"CQADupstackProgrammersRetrieval",
"CQADupstackStatsRetrieval",
"CQADupstackTexRetrieval",
"CQADupstackUnixRetrieval",
"CQADupstackWebmastersRetrieval",
"CQADupstackWordpressRetrieval",
"DBPedia",
"FEVER",
"FiQA2018",
"HotpotQA",
"MSMARCO",
"NFCorpus",
"NQ",
"QuoraRetrieval",
"SCIDOCS",
"SciFact",
"Touche2020",
"TRECCOVID",
]
TASK_LIST_STS = [
"BIOSSES",
"SICK-R",
"STS12",
"STS13",
"STS14",
"STS15",
"STS16",
"STS17",
"STS22",
"STSBenchmark",
"SummEval",
]
TASK_LIST = (
TASK_LIST_CLASSIFICATION
+ TASK_LIST_CLUSTERING
+ TASK_LIST_PAIR_CLASSIFICATION
+ TASK_LIST_RERANKING
+ TASK_LIST_RETRIEVAL
+ TASK_LIST_STS
)
model_name = "jinaai/jina-embedding-l-en-v1"
model = SentenceTransformer(model_name)
for task in TASK_LIST:
logger.info(f"Running task: {task}")
eval_splits = ["dev"] if task == "MSMARCO" else ["test"]
evaluation = MTEB(tasks=[task], task_langs=["en"]) # Remove "en" for running all languages
evaluation.run(model, output_folder=f"results/{model_name}", eval_splits=eval_splits)
|