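"""Gradio app: keyword search over a .zip corpus of .txt files, with optional
French lemmatization via spaCy's fr_core_news_sm model. Results can be
exported to an Excel (.xlsx) file."""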
import os
import subprocess
import sys
import tempfile
import zipfile

import gradio as gr
import pandas as pd
import spacy

# Load the French spaCy model, downloading it on first run if it is missing.
try:
    nlp = spacy.load("fr_core_news_sm")
except OSError:
    print("Downloading spaCy 'fr_core_news_sm' model...")
    # Use sys.executable so the model is installed into the same interpreter
    # that is running this script, even if "python" is not on PATH.
    subprocess.run([sys.executable, "-m", "spacy", "download", "fr_core_news_sm"], check=True)
    nlp = spacy.load("fr_core_news_sm")


def lemmatize_text(text):
    """Return `text` with each token replaced by its lemma."""
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)
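
# Illustrative usage (the exact lemmas depend on the model version):
#   lemmatize_text("les chats mangeaient") -> "le chat manger"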

# Module-level state shared between the upload and search callbacks.
raw_corpus = {}
lemmatized_corpus = {}
initial_df = pd.DataFrame()


def process_zip_initial(zip_file):
    """Extract the uploaded .zip, read every .txt file, and build the initial table."""
    global raw_corpus, lemmatized_corpus, initial_df
    raw_corpus = {}
    lemmatized_corpus = {}

    txt_files = []
    word_counts = []
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Walk the extracted tree and index every .txt file found.
        for root, dirs, files in os.walk(temp_dir):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    doc_name = os.path.basename(file_path)
                    txt_files.append(doc_name)

                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                    word_counts.append(len(text.split()))

                    # Keep a lower-cased raw version and a lemmatized version
                    # so either search mode can be served without re-reading.
                    raw_corpus[doc_name] = text.lower()
                    lemmatized_corpus[doc_name] = lemmatize_text(text.lower())

    initial_df = pd.DataFrame({"Nom du document": txt_files, "N. mots": word_counts})
    return initial_df


def process_zip_and_search(keywords_text, search_mode):
    """Count each keyword in every document and append the counts to the table."""
    global raw_corpus, lemmatized_corpus, initial_df

    # One keyword per line; blank lines are ignored.
    keywords = [keyword.strip().lower() for keyword in keywords_text.strip().split("\n") if keyword.strip()]
    if not keywords:
        return initial_df

    # Search the lemmatized corpus or the raw lower-cased one.
    corpus = lemmatized_corpus if search_mode == "Lemmes" else raw_corpus

    # Cells default to "" so documents without a match stay visually empty.
    # Note that str.count() matches substrings: "chat" also counts "chaton".
    results = {doc_name: {keyword: "" for keyword in keywords} for doc_name in corpus}
    for doc_name, text in corpus.items():
        for keyword in keywords:
            keyword_count = text.count(keyword)
            if keyword_count > 0:
                results[doc_name][keyword] = keyword_count

    # One row per document, one column per keyword.
    df_keywords = pd.DataFrame(results).T
    df_keywords.reset_index(inplace=True)
    df_keywords.rename(columns={"index": "Nom du document"}, inplace=True)

    final_df = pd.merge(initial_df, df_keywords, on="Nom du document", how="left")
    return final_df
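
# Illustrative final table for the keywords "chat" and "manger" (hypothetical data):
#   Nom du document | N. mots | chat | manger
#   doc1.txt        |    1534 |   12 |
#   doc2.txt        |     987 |      |      3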


def export_to_excel(df):
    """Write the current results table to a temporary .xlsx file and return its path."""
    # delete=False keeps the file alive after the `with` block so Gradio can
    # serve it; pandas needs an Excel engine such as openpyxl to write .xlsx.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        excel_path = tmp.name
    df.to_excel(excel_path, index=False)
    return excel_path


with gr.Blocks() as demo:
    gr.Markdown("# Recherche simple par mots-clés avec lemmatisation")

    with gr.Row():
        zip_file_input = gr.File(label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)")
    with gr.Row():
        keywords_input = gr.Textbox(label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)", placeholder="mots-clés...", lines=10)
    with gr.Row():
        search_mode = gr.Radio(label="Choisissez le type de recherche", choices=["Mots", "Lemmes"], value="Lemmes")
    with gr.Row():
        search_button = gr.Button("Recherche")
    with gr.Row():
        result_table = gr.DataFrame(label="Résultats", col_count=(1, "dynamic"), interactive=False)
    with gr.Row():
        export_button = gr.Button("Exporter vers Excel (.xlsx)")
        download_link = gr.File(label="Télécharger le fichier")

    # Wiring: uploading a zip builds the initial table, the search button
    # adds one column per keyword, and the export button produces the .xlsx.
    zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)
    search_button.click(fn=process_zip_and_search, inputs=[keywords_input, search_mode], outputs=result_table)
    export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)

demo.launch()
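# Tip: demo.launch(share=True) would additionally expose a temporary public link.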