import gradio as gr
import zipfile
import os
import sys
import tempfile
import pandas as pd
import spacy
import subprocess

# Ensure the spaCy French model is downloaded
try:
    nlp = spacy.load("fr_core_news_sm")
except OSError:
    print("Downloading spaCy 'fr_core_news_sm' model...")
    # Use the running interpreter so the model installs into the same environment
    subprocess.run([sys.executable, "-m", "spacy", "download", "fr_core_news_sm"], check=True)
    nlp = spacy.load("fr_core_news_sm")

# Function to lemmatize text using spaCy
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Global variables to store the corpus
raw_corpus = {}         # To store raw texts
lemmatized_corpus = {}  # To store lemmatized texts
initial_df = pd.DataFrame()

# Function to process the zip file, lemmatize text, get document names, and calculate word counts
def process_zip_initial(zip_file):
    global raw_corpus, lemmatized_corpus, initial_df  # The raw texts, lemmatized texts, and DataFrame
    raw_corpus = {}
    lemmatized_corpus = {}  # Reset the corpus on new upload

    # The change event also fires when the upload is cleared
    if zip_file is None:
        initial_df = pd.DataFrame()
        return initial_df

    # Newer Gradio versions pass the path as a str, older ones as a tempfile wrapper
    zip_path = zip_file if isinstance(zip_file, str) else zip_file.name

    # Create a temporary directory to extract files
    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract the zip file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Recursively get the list of all .txt files in all directories and lemmatize the text
        txt_files = []
        word_counts = []
        for root, dirs, files in os.walk(temp_dir):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    txt_files.append(os.path.basename(file_path))  # Only the file name

                    # Read the text
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                    word_count = len(text.split())  # Split on whitespace to count words
                    word_counts.append(word_count)

                    # Store raw text in raw_corpus
                    raw_corpus[os.path.basename(file_path)] = text.lower()

                    # Lemmatize the text and store in lemmatized_corpus
                    lemmatized_text = lemmatize_text(text.lower())
                    lemmatized_corpus[os.path.basename(file_path)] = lemmatized_text

    # Create a DataFrame with document names and word counts
    initial_df = pd.DataFrame({"Nom du document": txt_files, "N. mots": word_counts})
    return initial_df
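# Quick illustration of the lemmatizer above (output is indicative only; the
# exact lemmas depend on the installed fr_core_news_sm model version):
#   lemmatize_text("les chats mangeaient")  # e.g. -> "le chat manger"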
# Function to search for keywords in the selected corpus (raw or lemmatized)
def process_zip_and_search(keywords_text, search_mode):
    global raw_corpus, lemmatized_corpus, initial_df  # Use the texts stored at corpus upload and the initial DataFrame

    # Read the keywords, one per line (no lemmatization of keywords)
    keywords = [keyword.strip().lower() for keyword in keywords_text.strip().split("\n") if keyword.strip()]

    if not keywords:
        # If no keywords are provided, return the initial DataFrame (without the keyword columns)
        return initial_df

    # Select the appropriate corpus based on the search mode
    corpus = lemmatized_corpus if search_mode == "Lemmes" else raw_corpus

    # Prepare a dictionary to store the results (one row per document, one empty cell per keyword)
    results = {doc_name: {keyword: "" for keyword in keywords} for doc_name in corpus.keys()}

    # Search for keyword frequencies in each text file
    for doc_name, text in corpus.items():
        for keyword in keywords:
            keyword_count = text.count(keyword)  # Count occurrences of each keyword
            if keyword_count > 0:
                results[doc_name][keyword] = keyword_count

    # Convert the results dictionary to a DataFrame (transpose to get files as rows, keywords as columns)
    df_keywords = pd.DataFrame(results).T

    # Reset the index to make the document names a column, and name that column accordingly
    df_keywords.reset_index(inplace=True)
    df_keywords.rename(columns={"index": "Nom du document"}, inplace=True)

    # Replace 0 frequencies with empty strings
    df_keywords.replace(0, "", inplace=True)

    # Merge the initial DataFrame with the keyword search results
    final_df = pd.merge(initial_df, df_keywords, on="Nom du document", how="left")
    return final_df

# Function to export the DataFrame to Excel
def export_to_excel(df):
    # Create a temporary file for the Excel output
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        excel_path = tmp.name

    # Save the DataFrame to Excel
    df.to_excel(excel_path, index=False)
    return excel_path
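# Note: the search above uses str.count, which matches substrings, so a keyword
# like "art" is also counted inside "partir". If whole-word matching were wanted
# instead, a regex counter could be swapped in for text.count(keyword); this is
# a minimal sketch (hypothetical helper, not wired into the app):
import re

def count_whole_word(text, keyword):
    # \b anchors the pattern at word boundaries; re.escape keeps multi-word
    # keywords containing punctuation from being read as regex syntax
    return len(re.findall(r"\b" + re.escape(keyword) + r"\b", text))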
# Create the Gradio interface with one results table and export functionality
with gr.Blocks() as demo:
    gr.Markdown("# Recherche simple par mots-clés avec lemmatisation")  # App title

    with gr.Row():
        # File upload for the corpus
        zip_file_input = gr.File(label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)")

    with gr.Row():
        # Textbox for entering keywords
        keywords_input = gr.Textbox(
            label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)",
            placeholder="mots-clés...",
            lines=10,
        )

    with gr.Row():
        # Radio buttons to choose between raw-token and lemmatized search
        search_mode = gr.Radio(label="Choisissez le type de recherche", choices=["Mots", "Lemmes"], value="Lemmes")

    with gr.Row():
        # Button to trigger the keyword search
        search_button = gr.Button("Recherche")

    # Final results table, shown after upload and updated by each search
    with gr.Row():
        result_table = gr.DataFrame(label="Résultats", col_count=(1, "dynamic"), interactive=False)  # Disable renaming/editing

    # Controls to trigger the Excel export
    with gr.Row():
        export_button = gr.Button("Exporter vers Excel (.xlsx)")
        download_link = gr.File(label="Télécharger le fichier")

    # Display document names and word counts upon ZIP upload
    zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)

    # Update the table with keyword results for the selected search mode
    search_button.click(fn=process_zip_and_search, inputs=[keywords_input, search_mode], outputs=result_table)

    # Export the results to Excel
    export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)

# Launch the app
demo.launch()
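# Environment note (assumed setup): DataFrame.to_excel needs an Excel writer
# engine such as openpyxl, so a typical install is
#   pip install gradio pandas spacy openpyxl
# and the script is then run directly, e.g. `python app.py`.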