Delete pages
Browse files- pages/documentation.py +0 -107
pages/documentation.py
DELETED
@@ -1,107 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
|
3 |
-
# Set Streamlit page configuration
|
4 |
-
st.set_page_config(page_title="Documentation", layout="wide")
|
5 |
-
|
6 |
-
# Set up the Streamlit app layout
|
7 |
-
st.title("Documentation")
|
8 |
-
|
9 |
-
st.header("Dataset creation")
|
10 |
-
|
11 |
-
st.subheader(":blue[HAL API harvest]")
|
12 |
-
|
13 |
-
st.write("HAL is the French national open archive for scientific publications based on the principles of open access and self-archiving.")
|
14 |
-
st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
|
15 |
-
st.write("All records of article type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this looping function that populates a pandas Dataframe as output ")
|
16 |
-
st.code("""
|
17 |
-
# we retrieve first the total number of records
|
18 |
-
url_for_total_count = "https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=0"
|
19 |
-
response = requests.request("GET", url_for_total_count).text
|
20 |
-
data = json.loads(response)
|
21 |
-
total_count = data["response"]["numFound"]
|
22 |
-
|
23 |
-
""", language='python')
|
24 |
-
st.code("""
|
25 |
-
step = 1000
|
26 |
-
df = []
|
27 |
-
for i in range(1, int(total_count), int(step)):
|
28 |
-
url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows={step}&start={i}&wt=csv&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,abstract_s"
|
29 |
-
data = pd.read_csv(url, encoding="utf-8")
|
30 |
-
df.append(data)
|
31 |
-
df = pd.concat(df)
|
32 |
-
# clean up a little bit
|
33 |
-
df = df.drop_duplicates(subset=['uri_s'])
|
34 |
-
df = df.replace(np.nan, '')
|
35 |
-
""", language='python')
|
36 |
-
|
37 |
-
st.write("The dataframe's metadata columns are then concatenated into a single combined text in a new column. The different embedding models are then applied to this new column to encode the combined text and output a single vector embedding.")
|
38 |
-
st.code("""
|
39 |
-
df = df.astype(str)
|
40 |
-
df["combined"] = df.title_s + ". " + df.subTitle_s + ". " +df.abstract_s
|
41 |
-
""", language='python')
|
42 |
-
|
43 |
-
st.subheader(":blue[Huggingface open models for Embeddings]")
|
44 |
-
|
45 |
-
st.write("The open source Huggingface platform hosts a large number of pre-trained models that can then be reused for many tasks (text or image classification, summarization, document QA etc...). We can then use the sentence-transformers library applied on some of these available embedding pre-trained models for creating embeddings.")
|
46 |
-
st.write("There are two ways of working with the Huggingface hosted models: by using the [inference API endpoint](https://huggingface.co/inference-api) or by locally importing the model. Here we choose the second way.")
|
47 |
-
st.write("Two open source transformers-based models have been used to convert the textual metadata into numerical vector representation, which generated two vector embeddings datasets : embeddings_all-MiniLM-L6-v2.pt and embeddings_multi-qa-mpnet-base-dot-v1.pt")
|
48 |
-
st.code("""
|
49 |
-
import torch
|
50 |
-
from sentence_transformers import SentenceTransformer
|
51 |
-
|
52 |
-
embedder = SentenceTransformer('all-MiniLM-L6-v2') # or 'multi-qa-mpnet-base-dot-v1'
|
53 |
-
|
54 |
-
corpus_embeddings = embedder.encode(df.combined, convert_to_tensor=True)
|
55 |
-
|
56 |
-
# how to save and reload
|
57 |
-
torch.save(corpus_embeddings, f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
|
58 |
-
corpus_embeddings = torch.load(f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
|
59 |
-
|
60 |
-
""", language='python')
|
61 |
-
|
62 |
-
st.subheader(":blue[Bonus : OpenAI Embeddings]")
|
63 |
-
|
64 |
-
st.write("If you want to do the same with text-embedding-ada-002 (the OpenAI embeddings model)")
|
65 |
-
|
66 |
-
st.code("""
|
67 |
-
import openai
|
68 |
-
import tiktoken
|
69 |
-
from openai.embeddings_utils import get_embedding
|
70 |
-
|
71 |
-
openai.api_key = os.getenv("OPENAI_API_KEY")
|
72 |
-
|
73 |
-
# embedding model parameters
|
74 |
-
embedding_model = "text-embedding-ada-002"
|
75 |
-
embedding_encoding = "cl100k_base" # this is the encoding for text-embedding-ada-002
|
76 |
-
max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191
|
77 |
-
|
78 |
-
# filtering dataset on text under the max tokens limit
|
79 |
-
encoding = tiktoken.get_encoding(embedding_encoding)
|
80 |
-
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
|
81 |
-
df = df[df.n_tokens <= max_tokens]
|
82 |
-
|
83 |
-
# generate embeddings
|
84 |
-
def custom_get_embedding(text: str) -> list[float]:
|
85 |
-
return openai.Embedding.create(input=[text], model="text-embedding-ada-002")["data"][0]["embedding"]
|
86 |
-
df["openai_embedding"] = df.combined.apply(lambda x: custom_get_embedding(x) )
|
87 |
-
|
88 |
-
""", language='python')
|
89 |
-
|
90 |
-
st.write("And the Streamlit UI code would be:")
|
91 |
-
|
92 |
-
st.code("""
|
93 |
-
df["openai_embedding"] = df.openai_embedding.apply(literal_eval).apply(np.array)
|
94 |
-
def custom_get_embedding(text: str) -> list[float]:
|
95 |
-
return openai.Embedding.create(input=[text], model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)["data"][0]["embedding"]
|
96 |
-
def openai_response(query):
|
97 |
-
query_embedding = np.array(custom_get_embedding(
|
98 |
-
query
|
99 |
-
))
|
100 |
-
df["similarity"] = df.openai_embedding.apply(lambda x: cosine_similarity(x, query_embedding))
|
101 |
-
return df.sort_values("similarity", ascending=False).head(5).to_json(orient="records")
|
102 |
-
|
103 |
-
""", language='python')
|
104 |
-
|
105 |
-
st.header("Dataset hosting")
|
106 |
-
|
107 |
-
st.write("The CSV file of the dataset is available in the data folder.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|