Spaces:
Paused
Paused
adding source fetcher
Browse files
app.py
CHANGED
@@ -5,6 +5,8 @@ from markdown.extensions.fenced_code import FencedCodeExtension
|
|
5 |
from markdown.extensions.toc import TocExtension
|
6 |
from markdown.extensions.attr_list import AttrListExtension
|
7 |
from markdown.extensions.codehilite import CodeHiliteExtension
|
|
|
|
|
8 |
|
9 |
# For ReaderLM-2 functionality
|
10 |
from transformers import pipeline
|
@@ -36,7 +38,23 @@ def render_markdown(md_text):
|
|
36 |
)
|
37 |
|
38 |
######################################
|
39 |
-
# 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
######################################
|
41 |
# Load the JinaAI ReaderLM-v2 model
|
42 |
model_name = "jinaai/ReaderLM-v2"
|
@@ -67,7 +85,7 @@ def convert_html(html_input, output_format, custom_prompt=None):
|
|
67 |
return converted_output
|
68 |
|
69 |
######################################
|
70 |
-
#
|
71 |
######################################
|
72 |
# Prepare models and tokenizers for ReaderLM-1
|
73 |
print("Loading ReaderLM-1 models and tokenizers...") # Debug log
|
@@ -178,7 +196,29 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
|
|
178 |
md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
|
179 |
|
180 |
########################################################
|
181 |
-
# TAB 2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
########################################################
|
183 |
with gr.Tab("ReaderLM-2 Converter"):
|
184 |
gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)") # Tab description
|
@@ -231,7 +271,7 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
|
|
231 |
)
|
232 |
|
233 |
########################################################
|
234 |
-
# TAB
|
235 |
########################################################
|
236 |
with gr.Tab("ReaderLM-1 Converter"):
|
237 |
gr.Markdown("""
|
|
|
5 |
from markdown.extensions.toc import TocExtension
|
6 |
from markdown.extensions.attr_list import AttrListExtension
|
7 |
from markdown.extensions.codehilite import CodeHiliteExtension
|
8 |
+
import requests
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
|
11 |
# For ReaderLM-2 functionality
|
12 |
from transformers import pipeline
|
|
|
38 |
)
|
39 |
|
40 |
######################################
|
41 |
+
# 2) VIEW SOURCE INFO FUNCTIONALITY #
|
42 |
+
######################################
|
43 |
+
def view_source_info(url):
    """
    Fetch the raw HTML source of a web page.

    Accepts a plain URL, a scheme-less host (``example.com``), or a
    browser-style ``view-source:`` address. When no scheme is given,
    ``https://`` is assumed.

    Args:
        url: Address entered by the user. Surrounding whitespace is ignored.

    Returns:
        The response body as text, or an empty string when no address was
        supplied.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    prefix = "view-source:"
    url = (url or "").strip()
    # Drop only a leading prefix; str.replace would also delete the marker
    # from anywhere later in the URL (e.g. inside a query string).
    if url.lower().startswith(prefix):
        url = url[len(prefix):].strip()
    if not url:
        # Nothing to fetch; avoid issuing a request for a bare "https://".
        return ""
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    print(f"Fetching source for URL: {url}...")  # Debug log
    # Without a timeout, requests waits indefinitely on an unresponsive host.
    response = requests.get(url, timeout=15)
    return response.text
|
55 |
+
|
56 |
+
######################################
|
57 |
+
# 3) READERLM-2 FUNCTIONALITY #
|
58 |
######################################
|
59 |
# Load the JinaAI ReaderLM-v2 model
|
60 |
model_name = "jinaai/ReaderLM-v2"
|
|
|
85 |
return converted_output
|
86 |
|
87 |
######################################
|
88 |
+
# 4) READERLM-1 FUNCTIONALITY #
|
89 |
######################################
|
90 |
# Prepare models and tokenizers for ReaderLM-1
|
91 |
print("Loading ReaderLM-1 models and tokenizers...") # Debug log
|
|
|
196 |
md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
|
197 |
|
198 |
########################################################
|
199 |
+
# TAB 2: View Source Info
|
200 |
+
########################################################
|
201 |
+
with gr.Tab("View Source Info"):
|
202 |
+
gr.Markdown("## View HTML Source Code") # Tab description
|
203 |
+
|
204 |
+
source_input = gr.Textbox(
|
205 |
+
label="Enter URL (with or without 'view-source:')",
|
206 |
+
placeholder="e.g., https://example.com or view-source:example.com"
|
207 |
+
)
|
208 |
+
source_output = gr.Textbox(
|
209 |
+
label="HTML Source Code",
|
210 |
+
lines=20
|
211 |
+
)
|
212 |
+
source_button = gr.Button("Fetch Source") # Button to fetch source
|
213 |
+
|
214 |
+
source_button.click(
|
215 |
+
fn=view_source_info,
|
216 |
+
inputs=source_input,
|
217 |
+
outputs=source_output
|
218 |
+
)
|
219 |
+
|
220 |
+
########################################################
|
221 |
+
# TAB 3: ReaderLM-2 Converter (HTML → Markdown/JSON)
|
222 |
########################################################
|
223 |
with gr.Tab("ReaderLM-2 Converter"):
|
224 |
gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)") # Tab description
|
|
|
271 |
)
|
272 |
|
273 |
########################################################
|
274 |
+
# TAB 4: ReaderLM-1 HTML-to-Markdown
|
275 |
########################################################
|
276 |
with gr.Tab("ReaderLM-1 Converter"):
|
277 |
gr.Markdown("""
|