import gradio as gr
import zipfile
import os
import tempfile
import pandas as pd
import spacy
import subprocess
import sys
# Ensure the spaCy French model is downloaded
try:
    nlp = spacy.load("fr_core_news_sm")
except OSError:
    print("Downloading spaCy 'fr_core_news_sm' model...")
    # Use the current interpreter so the model installs into the active environment
    subprocess.run([sys.executable, "-m", "spacy", "download", "fr_core_news_sm"], check=True)
    nlp = spacy.load("fr_core_news_sm")
# Function to lemmatize text using spaCy
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])
# Global variables to store the corpus
raw_corpus = {} # To store raw texts
lemmatized_corpus = {} # To store lemmatized texts
initial_df = pd.DataFrame()
# Function to process the zip file, lemmatize text, get document names, and calculate word counts
def process_zip_initial(zip_file):
    global raw_corpus, lemmatized_corpus, initial_df  # Store the raw texts, lemmatized texts, and DataFrame
    raw_corpus = {}
    lemmatized_corpus = {}  # Reset the corpus on new upload

    # Create a temporary directory to extract files
    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract the zip file
        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Recursively collect every .txt file in all subdirectories and lemmatize its text
        txt_files = []
        word_counts = []
        for root, dirs, files in os.walk(temp_dir):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    txt_files.append(os.path.basename(file_path))  # Only the file name
                    # Read the text
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                    word_count = len(text.split())  # Count whitespace-separated tokens
                    word_counts.append(word_count)
                    # Store raw text in raw_corpus
                    raw_corpus[os.path.basename(file_path)] = text.lower()
                    # Lemmatize the text and store in lemmatized_corpus
                    lemmatized_text = lemmatize_text(text.lower())
                    lemmatized_corpus[os.path.basename(file_path)] = lemmatized_text

    # Create a DataFrame with document names and word counts
    initial_df = pd.DataFrame({"Nom du document": txt_files, "N. mots": word_counts})
    return initial_df
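# The table returned at upload has one row per .txt file, e.g. (illustrative values):
#   Nom du document | N. mots
#   doc1.txt        | 1523
#   doc2.txt        | 847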
# Function to search for keywords in the selected corpus (raw or lemmatized)
def process_zip_and_search(keywords_text, search_mode):
    global raw_corpus, lemmatized_corpus, initial_df  # Use the texts stored at corpus upload and the initial DataFrame

    # Read the keywords (the keywords themselves are not lemmatized)
    keywords = [keyword.strip().lower() for keyword in keywords_text.strip().split("\n") if keyword.strip()]
    if not keywords:
        # If no keywords are provided, return the initial DataFrame (without keyword columns)
        return initial_df

    # Select the appropriate corpus based on the search mode
    corpus = lemmatized_corpus if search_mode == "Lemmes" else raw_corpus

    # Prepare a dictionary to store the results (one entry per document, initialized empty)
    results = {doc_name: {keyword: "" for keyword in keywords} for doc_name in corpus.keys()}

    # Search for keyword frequencies in each text file
    for doc_name, text in corpus.items():
        for keyword in keywords:
            keyword_count = text.count(keyword)  # Count occurrences of each keyword
            if keyword_count > 0:
                results[doc_name][keyword] = keyword_count

    # Convert the results dictionary to a DataFrame (transpose: files as rows, keywords as columns)
    df_keywords = pd.DataFrame(results).T
    # Reset the index to make the document names a column named 'Nom du document'
    df_keywords.reset_index(inplace=True)
    df_keywords.rename(columns={"index": "Nom du document"}, inplace=True)
    # Replace 0 frequencies with empty strings
    df_keywords.replace(0, "", inplace=True)

    # Merge the initial DataFrame with the keyword search results
    final_df = pd.merge(initial_df, df_keywords, on="Nom du document", how="left")
    return final_df
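# Note: str.count does raw substring matching, so a keyword like "chat" is also
# counted inside "chaton". Counting whole words only would need a word-boundary
# regex, e.g. len(re.findall(r"\b" + re.escape(keyword) + r"\b", text)).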
# Function to export the DataFrame to Excel
def export_to_excel(df):
    # Create a temporary file for the Excel export
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        excel_path = tmp.name
    # Save the DataFrame to Excel (after the handle is closed, which also works on Windows)
    df.to_excel(excel_path, index=False)
    return excel_path
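# Note: delete=False leaves the .xlsx on disk so Gradio can serve it for
# download; the files accumulate until the OS cleans its temp directory.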
# Create Gradio interface with one results table and export functionality
with gr.Blocks() as demo:
    gr.Markdown("# Recherche simple par mots-clés avec lemmatisation")  # App title

    with gr.Row():
        # File upload for the corpus
        zip_file_input = gr.File(label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)")
    with gr.Row():
        # Textbox for entering keywords
        keywords_input = gr.Textbox(label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)", placeholder="mots-clés...", lines=10)
    with gr.Row():
        # Radio buttons to choose between raw-token and lemmatized search
        search_mode = gr.Radio(label="Choisissez le type de recherche", choices=["Mots", "Lemmes"], value="Lemmes")
    with gr.Row():
        # Button to trigger the keyword search
        search_button = gr.Button("Recherche")

    # Results table shown below the search button
    with gr.Row():
        result_table = gr.DataFrame(label="Résultats", col_count=(1, "dynamic"), interactive=False)  # Disable renaming/editing

    # Button to trigger the Excel export
    with gr.Row():
        export_button = gr.Button("Exporter vers Excel (.xlsx)")
        download_link = gr.File(label="Télécharger le fichier")

    # Display document names and word counts upon ZIP upload
    zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)
    # Update the table with keyword results based on the selected search mode
    search_button.click(fn=process_zip_and_search, inputs=[keywords_input, search_mode], outputs=result_table)
    # Export the results table to Excel
    export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)

# Launch the app
demo.launch()