Nymbo committed on
Commit
da60cd1
·
verified ·
1 Parent(s): c2c6d29

adding source fetcher

Browse files
Files changed (1) hide show
  1. app.py +44 -4
app.py CHANGED
@@ -5,6 +5,8 @@ from markdown.extensions.fenced_code import FencedCodeExtension
5
  from markdown.extensions.toc import TocExtension
6
  from markdown.extensions.attr_list import AttrListExtension
7
  from markdown.extensions.codehilite import CodeHiliteExtension
 
 
8
 
9
  # For ReaderLM-2 functionality
10
  from transformers import pipeline
@@ -36,7 +38,23 @@ def render_markdown(md_text):
36
  )
37
 
38
  ######################################
39
- # 2) READERLM-2 FUNCTIONALITY #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  ######################################
41
  # Load the JinaAI ReaderLM-v2 model
42
  model_name = "jinaai/ReaderLM-v2"
@@ -67,7 +85,7 @@ def convert_html(html_input, output_format, custom_prompt=None):
67
  return converted_output
68
 
69
  ######################################
70
- # 3) READERLM-1 FUNCTIONALITY #
71
  ######################################
72
  # Prepare models and tokenizers for ReaderLM-1
73
  print("Loading ReaderLM-1 models and tokenizers...") # Debug log
@@ -178,7 +196,29 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
178
  md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
179
 
180
  ########################################################
181
- # TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  ########################################################
183
  with gr.Tab("ReaderLM-2 Converter"):
184
  gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)") # Tab description
@@ -231,7 +271,7 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
231
  )
232
 
233
  ########################################################
234
- # TAB 3: ReaderLM-1 HTML-to-Markdown
235
  ########################################################
236
  with gr.Tab("ReaderLM-1 Converter"):
237
  gr.Markdown("""
 
5
  from markdown.extensions.toc import TocExtension
6
  from markdown.extensions.attr_list import AttrListExtension
7
  from markdown.extensions.codehilite import CodeHiliteExtension
8
+ import requests
9
+ from bs4 import BeautifulSoup
10
 
11
  # For ReaderLM-2 functionality
12
  from transformers import pipeline
 
38
  )
39
 
40
  ######################################
41
+ # 2) VIEW SOURCE INFO FUNCTIONALITY #
42
+ ######################################
43
+ def view_source_info(url):
44
+ """
45
+ Fetch the HTML source of the given URL.
46
+ - Supports `view-source:` prefix or plain URLs.
47
+ """
48
+ if url.startswith("view-source:"):
49
+ url = url.replace("view-source:", "").strip()
50
+ if not url.startswith(("http://", "https://")):
51
+ url = "https://" + url
52
+ print(f"Fetching source for URL: {url}...") # Debug log
53
+ response = requests.get(url)
54
+ return response.text
55
+
56
+ ######################################
57
+ # 3) READERLM-2 FUNCTIONALITY #
58
  ######################################
59
  # Load the JinaAI ReaderLM-v2 model
60
  model_name = "jinaai/ReaderLM-v2"
 
85
  return converted_output
86
 
87
  ######################################
88
+ # 4) READERLM-1 FUNCTIONALITY #
89
  ######################################
90
  # Prepare models and tokenizers for ReaderLM-1
91
  print("Loading ReaderLM-1 models and tokenizers...") # Debug log
 
196
  md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
197
 
198
  ########################################################
199
+ # TAB 2: View Source Info
200
+ ########################################################
201
+ with gr.Tab("View Source Info"):
202
+ gr.Markdown("## View HTML Source Code") # Tab description
203
+
204
+ source_input = gr.Textbox(
205
+ label="Enter URL (with or without 'view-source:')",
206
+ placeholder="e.g., https://example.com or view-source:example.com"
207
+ )
208
+ source_output = gr.Textbox(
209
+ label="HTML Source Code",
210
+ lines=20
211
+ )
212
+ source_button = gr.Button("Fetch Source") # Button to fetch source
213
+
214
+ source_button.click(
215
+ fn=view_source_info,
216
+ inputs=source_input,
217
+ outputs=source_output
218
+ )
219
+
220
+ ########################################################
221
+ # TAB 3: ReaderLM-2 Converter (HTML → Markdown/JSON)
222
  ########################################################
223
  with gr.Tab("ReaderLM-2 Converter"):
224
  gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)") # Tab description
 
271
  )
272
 
273
  ########################################################
274
+ # TAB 4: ReaderLM-1 HTML-to-Markdown
275
  ########################################################
276
  with gr.Tab("ReaderLM-1 Converter"):
277
  gr.Markdown("""