Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import re
|
3 |
+
import pandas as pd
|
4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
|
7 |
+
# --- Streamlit UI: page heading ---
st.title('Site Migration URL Mapper')

# Two free-text inputs, rendered in order (old first, then new); the text of
# each is split on newlines into a list of URL strings.
old_site_urls, new_site_urls = (
    st.text_area(label).split('\n')
    for label in (
        'Enter the old site URLs (one per line)',
        'Enter the new site URLs (one per line)',
    )
)
|
13 |
+
|
14 |
+
def preprocess_url(url):
    """Reduce a URL to a lowercase keyword string for text comparison.

    Steps, in order:
      1. Trim surrounding whitespace.
      2. Remove the ``http(s)://host`` part.
      3. Remove a trailing ``.ext`` style suffix (e.g. ``.html``).
      4. Lowercase and replace every character that is not ``a-z``, ``0-9``
         or whitespace with a single space.

    Args:
        url: The URL string to process.

    Returns:
        The processed string. Separators become spaces, so
        ``https://www.example.com/blog/my-post.html`` yields
        ``' blog my post'`` (note the leading space from the first ``/``).
    """
    # Lines pasted into a text box often carry stray spaces or a trailing
    # '\r'; left in place they prevent the '$'-anchored extension pattern
    # below from matching, and the extension leaks in as a keyword.
    processed_url = url.strip()
    # Scheme and host are matched case-insensitively so 'HTTP://Example.COM'
    # is stripped the same way as its lowercase form.
    processed_url = re.sub(
        r'https?:\/\/(?:www\.)?[^\/]+', '', processed_url, flags=re.IGNORECASE
    )
    processed_url = re.sub(r'\.\w+$', '', processed_url)
    processed_url = re.sub(r'[^a-z0-9\s]', ' ', processed_url.lower())
    return processed_url
|
20 |
+
|
21 |
+
def map_urls(old_urls, new_urls):
    """Map each old site URL to its most similar new site URL.

    Every URL is reduced to keywords with ``preprocess_url``; the keywords of
    both lists are vectorized together with TF-IDF, and each old URL is paired
    with the new URL whose vector has the highest cosine similarity.

    Args:
        old_urls: Iterable of old site URL strings. Blank or whitespace-only
            entries are ignored.
        new_urls: Iterable of new site URL strings. Blank or whitespace-only
            entries are ignored.

    Returns:
        A list of ``(old_url, best_match_url)`` tuples, one per non-blank old
        URL, in input order. URLs are returned with surrounding whitespace
        removed. The list is empty when either side has no non-blank URL.

    Raises:
        ValueError: May propagate from the vectorizer when none of the URLs
            yields a usable token (scikit-learn reports an empty vocabulary).
    """
    # Blank lines (e.g. from splitting an empty text box on '\n') carry no
    # information and would otherwise be "mapped" to an arbitrary URL.
    old_urls = [url.strip() for url in old_urls if url.strip()]
    new_urls = [url.strip() for url in new_urls if url.strip()]
    if not old_urls or not new_urls:
        return []

    old_urls_processed = [preprocess_url(url) for url in old_urls]
    new_urls_processed = [preprocess_url(url) for url in new_urls]

    # Fit on both lists at once so old and new URLs share one vocabulary.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(old_urls_processed + new_urls_processed)

    # Rows come back in input order: old URLs first, then new URLs.
    old_vectors = tfidf_matrix[:len(old_urls)]
    new_vectors = tfidf_matrix[len(old_urls):]

    # similarity_matrix[i][j] compares old URL i with new URL j.
    similarity_matrix = cosine_similarity(old_vectors, new_vectors)

    # NOTE: argmax returns index 0 for an all-zero row, so an old URL that
    # shares no keyword with any new URL is still paired with the first one.
    best_match_indices = similarity_matrix.argmax(axis=1)
    return [
        (old_url, new_urls[best_match_idx])
        for old_url, best_match_idx in zip(old_urls, best_match_indices)
    ]
|
41 |
+
|
42 |
+
if st.button('Generate Mappings'):
    # Splitting an empty text box on '\n' yields [''], which is truthy, so the
    # raw lists can never look empty. Drop blank lines (and stray whitespace)
    # before deciding whether the user actually supplied URLs.
    old_urls_clean = [url.strip() for url in old_site_urls if url.strip()]
    new_urls_clean = [url.strip() for url in new_site_urls if url.strip()]

    if old_urls_clean and new_urls_clean:
        try:
            mappings = map_urls(old_urls_clean, new_urls_clean)
        except ValueError as exc:
            # scikit-learn raises ValueError when the input cannot be
            # vectorized (for example, no URL contains a usable keyword);
            # show it in the UI instead of a raw traceback.
            st.error(f"Could not generate mappings: {exc}")
        else:
            df_mappings = pd.DataFrame(mappings, columns=['Old URL', 'New URL'])
            st.dataframe(df_mappings)

            # Download button for the mappings
            csv = df_mappings.to_csv(index=False).encode('utf-8')
            st.download_button("Download Mappings", csv, "url_mappings.csv", "text/csv", key='download-csv')
    else:
        st.error("Please enter URLs for both old and new sites.")
|