Spaces:
Paused
Paused
File size: 12,541 Bytes
3cf27bd da60cd1 33d554a 36709a2 571f7e3 3cf27bd 36709a2 33d554a 3cf27bd 33d554a 36709a2 33d554a 36709a2 33d554a 3cf27bd 36709a2 3cf27bd 33d554a da60cd1 33d554a 571f7e3 36709a2 33d554a 571f7e3 36709a2 33d554a 36709a2 33d554a 36709a2 52d93e9 33d554a 36709a2 571f7e3 36709a2 571f7e3 33d554a da60cd1 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 571f7e3 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a ca58a74 36709a2 571f7e3 ca58a74 33d554a 36709a2 ca58a74 33d554a 36709a2 33d554a 3cf27bd 36709a2 33d554a 3cf27bd 33d554a da60cd1 33d554a 36709a2 571f7e3 33d554a 36709a2 571f7e3 33d554a 36709a2 33d554a 36709a2 33d554a 571f7e3 36709a2 33d554a 52d93e9 36709a2 571f7e3 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a da60cd1 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 36709a2 33d554a 571f7e3 36709a2 3cf27bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 |
import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension
import requests
from bs4 import BeautifulSoup
# For ReaderLM-2 functionality
from transformers import pipeline
# For ReaderLM-1 functionality
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify
######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY #
######################################
def render_markdown(md_text):
    """
    Convert a Markdown string into HTML.

    The conversion runs with the table, fenced-code, TOC (baselevel=2),
    attribute-list and CodeHilite extensions enabled; highlighted code is
    emitted under the "highlight" CSS class without line numbers.
    """
    print("Rendering markdown input to HTML...")  # Debug log
    # Assemble the extension instances up front so the conversion call stays short.
    active_extensions = [
        TableExtension(),
        FencedCodeExtension(),
        TocExtension(baselevel=2),
        AttrListExtension(),
        CodeHiliteExtension(linenums=False, css_class="highlight"),
    ]
    rendered_html = markdown.markdown(md_text, extensions=active_extensions)
    return rendered_html
######################################
# 2) VIEW SOURCE INFO FUNCTIONALITY #
######################################
def view_source_info(url, timeout=15):
    """
    Fetch the HTML source of the given URL.

    - Supports a leading `view-source:` prefix or plain URLs.
    - Prepends `https://` when the address has no http/https scheme.

    Args:
        url: Address to fetch; surrounding whitespace is ignored.
        timeout: Seconds passed to `requests.get` as its timeout.

    Returns:
        The response body as text (returned regardless of HTTP status).

    Raises:
        ValueError: If nothing is left of the address after trimming.
        requests.RequestException: If the request itself fails or times out.
    """
    prefix = "view-source:"
    url = (url or "").strip()
    if url.startswith(prefix):
        # Remove only the leading prefix; str.replace would also rewrite any
        # later occurrence inside the path or query string.
        url = url[len(prefix):].strip()
    if not url:
        # Fail early instead of sending a request to the bare "https://".
        raise ValueError("Please enter a URL to fetch.")
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    print(f"Fetching source for URL: {url}...")  # Debug log
    # An explicit timeout keeps an unresponsive host from blocking the handler forever.
    response = requests.get(url, timeout=timeout)
    return response.text
######################################
# 3) READERLM-2 FUNCTIONALITY #
######################################
# Build the ReaderLM-v2 text-generation pipeline once, when the module is loaded
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...")  # Debug log
html_converter = pipeline(task="text-generation", model=model_name)
def convert_html(html_input, output_format, custom_prompt=None):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.

    Args:
        html_input: Raw HTML to convert.
        output_format: Target format name, interpolated into the default prompt.
        custom_prompt: Optional instruction; when non-empty it replaces the
            default "Convert the following HTML into ..." instruction.

    Returns:
        The generated text with surrounding whitespace stripped, or an empty
        string when `html_input` is empty, whitespace-only or None.
    """
    if not html_input or not html_input.strip():
        # Nothing to convert: skip the model call rather than generating
        # text from the instruction alone.
        return ""
    if custom_prompt:
        prompt = f"{custom_prompt}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
    print(f"Converting HTML to {output_format} with prompt: {custom_prompt or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input
    # return_full_text=False makes the pipeline return only the newly generated
    # text, so the prompt does not have to be removed by string replacement
    # (which would also delete any later repetition of the prompt text).
    response = html_converter(
        prompt,
        max_length=9999,
        num_return_sequences=1,
        return_full_text=False,
    )
    converted_output = response[0]["generated_text"].strip()
    print("Conversion completed.")  # Debug log
    return converted_output
######################################
# 4) READERLM-1 FUNCTIONALITY #
######################################
# Load both ReaderLM-1 checkpoints and their tokenizers, keyed by model ID
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
models = {
    # Each model is switched to eval mode and moved to the "cuda" device.
    model_id: AutoModelForCausalLM.from_pretrained(
        model_id, trust_remote_code=True
    ).eval().to("cuda")
    for model_id in ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
}
tokenizers = {
    # One tokenizer per loaded model, stored under the same key.
    model_id: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    for model_id in models
}
@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Generate Markdown from HTML using ReaderLM (0.5b or 1.5b) models.

    Args:
        html_content: Raw HTML sent to the model as the single user message.
        model_id: Key into the module-level `models` / `tokenizers` dicts.

    Returns:
        A `(model_output, markdownify_output)` tuple: the text of the model's
        assistant turn and the rule-based `markdownify` conversion of the
        same HTML.
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    model = models[model_id]  # Select the model based on the input ID
    tokenizer = tokenizers[model_id]  # Retrieve the corresponding tokenizer
    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text
    # Tokenize the input text
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    # Generate output using the model
    outputs = model.generate(
        inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
    )
    decoded = tokenizer.decode(outputs[0])
    # Extract the assistant's response from the generated output
    assistant_marker = "<|im_start|>assistant"
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    match = re.search(pattern, decoded, re.DOTALL)
    if match:
        assistant_response = match.group(1)
    else:
        # No closing <|im_end|> (e.g. generation stopped at max_new_tokens):
        # keep whatever follows the assistant marker instead of raising
        # IndexError on an empty match list.
        _, marker_found, tail = decoded.partition(assistant_marker)
        assistant_response = tail if marker_found else ""
    print("Model generation completed.")  # Debug log
    # Use markdownify as a rule-based fallback for comparison
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log
    # Return both model-based and rule-based outputs
    return assistant_response, markdownify_output
# Sample to-do-list markup used as the example input for ReaderLM-1
example_html = "\n".join(
    [
        '<div id="myDIV" class="header">',
        '<h2>My To Do List</h2>',
        '<input type="text" id="myInput" placeholder="Title...">',
        '<span onclick="newElement()" class="addBtn">Add</span>',
        '</div>',
        '<ul id="myUL">',
        '<li>Hit the gym</li>',
        '<li class="checked">Pay bills</li>',
        '<li>Meet George</li>',
        '<li>Buy eggs</li>',
        '<li>Read a book</li>',
        '<li>Organize office</li>',
        '</ul>',
    ]
)
########################################################
# Combine everything into a single Gradio Blocks app #
########################################################
# Optional extra CSS for styling the ReaderLM-1 tab.
# Annotations inside the stylesheet use /* ... */ because CSS has no "#"
# comment syntax; a "# ..." line is parsed as part of the next declaration
# and causes that declaration to be dropped.
css = """
#output {
    height: 500px; /* Set the height of the output box */
    overflow: auto; /* Enable scrolling for large content */
    border: 1px solid #ccc; /* Add a border around the box */
}
"""
# Initialize the Gradio app with the Nymbo/Nymbo_Theme for styling.
# The four tabs below wire the module-level handlers (render_markdown,
# view_source_info, convert_html, run_example) to their input/output widgets.
print("Initializing Gradio app...")  # Debug log
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")  # Add a title for the tab
        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",  # Input for Markdown text
                )
            with gr.Column():
                md_output = gr.HTML(
                    label="Rendered Output"  # Display the rendered HTML output
                )
        # Update the output whenever the input changes
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

    ########################################################
    # TAB 2: View Source Info
    ########################################################
    with gr.Tab("View Source Info"):
        gr.Markdown("## View HTML Source Code")  # Tab description
        source_input = gr.Textbox(
            label="Enter URL (with or without 'view-source:')",
            placeholder="e.g., https://example.com or view-source:example.com",
        )
        source_output = gr.Textbox(
            label="HTML Source Code",
            lines=20,
        )
        source_button = gr.Button("Fetch Source")  # Button to fetch source
        source_button.click(
            fn=view_source_info,
            inputs=source_input,
            outputs=source_output,
        )

    ########################################################
    # TAB 3: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")  # Tab description
        # NOTE(review): the source lost its indentation, so which widgets sit
        # inside this row is a reconstruction — confirm against the intended layout.
        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input",  # Input for raw HTML
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],  # Choose the output format
                label="Output Format",
                value="Markdown",  # Default to Markdown output
            )
        custom_prompt_2 = gr.Textbox(
            lines=2,
            placeholder="Optional: Enter a custom prompt...",
            label="Custom System Prompt",
        )
        convert_btn_2 = gr.Button("Convert")  # Button to trigger conversion
        converted_output_2 = gr.Textbox(
            lines=20,
            label="Converted Output",  # Display the converted output
        )
        # Provide usage details for the converter
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )
        # Connect the button click event to the conversion function
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2,
        )
        # Add example inputs for demonstration.
        # The custom-prompt column is left empty: convert_html treats any
        # non-empty prompt as a replacement for its default instruction, so a
        # placeholder sentence here would drop the requested output format.
        gr.Examples(
            examples=[
                ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown", ""],
                ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON", ""],
            ],
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2,
            fn=convert_html,
            cache_examples=False,  # Disable caching for dynamic examples
        )

    ########################################################
    # TAB 4: ReaderLM-1 HTML-to-Markdown
    ########################################################
    with gr.Tab("ReaderLM-1 Converter"):
        # Written as joined single-line literals so code indentation never
        # leaks into the Markdown text.
        gr.Markdown(
            "\n# HTML-to-Markdown with ReaderLM-1\n"
            "Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b**\n"
            "to convert HTML to Markdown. Compare against rule-based `markdownify`.\n"
        )
        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),  # Allow selection between the two models
                    label="Model",
                    value="jinaai/reader-lm-1.5b",  # Default to the larger model
                )
                html_content = gr.Textbox(
                    label="HTML"  # Input for raw HTML
                )
                submit_btn = gr.Button(value="Submit")  # Button to trigger the model
            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")  # Model-generated Markdown
                markdownify_output = gr.Textbox(label="Markdownify Output")  # Rule-based Markdown
        # Add example HTML input for demonstration; only the HTML is supplied,
        # so run_example falls back to its default model_id for this example.
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            cache_examples=True,  # Cache example outputs
            label="Try example HTML",
        )
        # Connect the submit button to the run_example function
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output],
        )
# Finally, launch the combined demo app
print("Launching the demo...")  # Debug log
demo.launch()