Mishal23 committed
Commit dcfc8fd · verified · 1 Parent(s): 765a30b

Update app.py

Files changed (1):
  1. app.py +34 -46
app.py CHANGED
@@ -1,117 +1,105 @@
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
-import pdfkit
 import os
 import math
+from docx import Document  # Import for Word file generation
+
 # Function to extract all links from a website
 def extract_links(url):
     try:
         response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
         if response.status_code != 200:
             return f"Error: Unable to fetch page (Status Code {response.status_code})", []
-
+
         soup = BeautifulSoup(response.text, "html.parser")
         base_url = "/".join(url.split("/")[:3])  # Extract base domain
-
+
         links = []
         for a_tag in soup.find_all("a", href=True):
             href = a_tag["href"]
             if not href.startswith("http"):  # Convert relative links to absolute
                 href = base_url + href if href.startswith("/") else base_url + "/" + href
-
+
             links.append(href)

         links = list(set(links))  # Remove duplicates
         if not links:
             return "No links found on the website.", []
-
-        return f"✅ {len(links)} links found! Select which ones to convert into PDFs:", links
+
+        return f"✅ {len(links)} links found! Select which ones to convert into Word files:", links

     except Exception as e:
         return f"Error: {str(e)}", []

 # Function to clean unwanted content (like headers, footers, etc.)
 def clean_content(soup):
-    # Remove common unwanted elements
     for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
         tag.decompose()  # Remove the tag completely
-
-    # You can also remove specific classes or IDs if necessary, for example:
-    # for tag in soup.find_all(attrs={"class": "footer"}):
-    #     tag.decompose()
-
-    # Get the cleaned text from the remaining content
+
     return soup.get_text(separator="\n", strip=True)

-# Function to scrape selected links and generate PDFs
-def scrape_and_generate_pdfs(selected_links):
+# Function to scrape selected links and generate Word files
+def scrape_and_generate_word(selected_links):
     try:
         if not selected_links:
             return "No links selected.", None
-
-        pdf_files = []
-        batch_size = 4  # Each PDF contains up to 4 links

-        # Process selected links in batches of 4
+        word_files = []
+        batch_size = 4  # Each Word file contains up to 4 links
+
         for i in range(0, len(selected_links), batch_size):
             batch_links = selected_links[i:i + batch_size]
-            all_text = ""
+            doc = Document()

-            # Scrape text content from each selected link
             for link in batch_links:
                 try:
                     response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                     if response.status_code == 200:
                         soup = BeautifulSoup(response.text, "html.parser")
                         page_text = clean_content(soup)
-                        all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
-                except:
-                    all_text += f"Failed to fetch content from {link}\n\n"
-
-            if all_text:
-                pdf_filename = f"output_{(i//batch_size) + 1}.pdf"
-
-                # Save as temporary HTML file
-                html_path = f"temp_{i}.html"
-                with open(html_path, "w", encoding="utf-8") as f:
-                    f.write(f"<html><body><pre>{all_text}</pre></body></html>")

-                # Convert HTML to PDF
-                pdfkit.from_file(html_path, pdf_filename)
-                os.remove(html_path)
+                        # Add title for each link
+                        doc.add_heading(f"Content from: {link}", level=1)
+                        doc.add_paragraph(page_text)
+                        doc.add_page_break()  # Ensure proper formatting
+                except:
+                    doc.add_paragraph(f"Failed to fetch content from {link}\n\n")

-                pdf_files.append(pdf_filename)
+            # Save the Word file
+            word_filename = f"output_{(i//batch_size) + 1}.docx"
+            doc.save(word_filename)
+            word_files.append(word_filename)

-        return pdf_files  # Return list of generated PDFs
+        return word_files  # Return list of generated Word files

     except Exception as e:
         return f"Error: {str(e)}", None

 # Gradio UI with link selection
-def show_links_and_generate_pdfs(url):
+def show_links_and_generate_word(url):
     message, links = extract_links(url)
     if not links:
         return message, gr.update(choices=[], value=[])
-
+
     return message, gr.update(choices=links, value=[])

 iface = gr.Blocks()

 with iface:
-    gr.Markdown("### 🌐 Web Scraper & PDF Generator")
-    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into PDFs (4 links per PDF).")
+    gr.Markdown("### 🌐 Web Scraper & Word Document Generator")
+    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into Word files (4 links per file).")

     url_input = gr.Textbox(label="Enter Website URL")
     extract_btn = gr.Button("Extract Links")

     message_output = gr.Markdown("")
     link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
-    generate_btn = gr.Button("Generate PDFs")
-
-    pdf_output = gr.File(label="Download Generated PDFs")
+    generate_btn = gr.Button("Generate Word Files")
+
+    word_output = gr.File(label="Download Generated Word Files")

-    extract_btn.click(show_links_and_generate_pdfs, inputs=url_input, outputs=[message_output, link_selector])
-    generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)
+    extract_btn.click(show_links_and_generate_word, inputs=url_input, outputs=[message_output, link_selector])
+    generate_btn.click(scrape_and_generate_word, inputs=link_selector, outputs=word_output)

     iface.launch()
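
For context, a minimal sketch of the python-docx flow this commit switches to, isolated from the Gradio UI and the network fetching. The `sample_pages` data is a hypothetical stand-in; in app.py the text comes from `clean_content()` applied to each fetched page, and the filename follows the same `output_{n}.docx` scheme as `scrape_and_generate_word`.

# Sketch of the new docx generation step, assuming python-docx is installed.
# sample_pages is hypothetical stand-in data for the scraped, cleaned text.
from docx import Document

sample_pages = {
    "https://example.com/a": "Cleaned text of the first page.",
    "https://example.com/b": "Cleaned text of the second page.",
}

doc = Document()
for link, page_text in sample_pages.items():
    doc.add_heading(f"Content from: {link}", level=1)  # one heading per source link
    doc.add_paragraph(page_text)                       # the scraped body text
    doc.add_page_break()                               # each link starts on a new page

doc.save("output_1.docx")  # same naming scheme as the app's batched output

Opening output_1.docx should show one heading plus one page per link, which is what the add_page_break() call in the diff is there for.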