import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension
import requests
from bs4 import BeautifulSoup

# For ReaderLM-2 functionality
from transformers import pipeline

# For ReaderLM-1 functionality
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify

######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY   #
######################################
def render_markdown(md_text):
    """
    Render a string of Markdown text into HTML using various Markdown extensions.
    - Supports tables, fenced code blocks, TOC, attribute lists, and syntax highlighting.
    """
    print("Rendering markdown input to HTML...")  # Debug log
    return markdown.markdown(
        md_text,
        extensions=[
            TableExtension(),  # Adds support for Markdown tables
            FencedCodeExtension(),  # Allows for fenced code blocks
            TocExtension(baselevel=2),  # Generates a Table of Contents starting at level 2
            AttrListExtension(),  # Enables attribute lists for elements
            CodeHiliteExtension(linenums=False, css_class="highlight"),  # Syntax highlighting for code blocks
        ],
    )
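
# A minimal, hedged usage sketch (the sample Markdown below is illustrative only and
# is not part of the app's flow):
#
#   html = render_markdown("# Title\n\n| a | b |\n|---|---|\n| 1 | 2 |")
#   # -> an HTML string such as '<h1 id="title">Title</h1>\n<table>...</table>'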

######################################
# 2) VIEW SOURCE INFO FUNCTIONALITY  #
######################################
def view_source_info(url):
    """
    Fetch the HTML source of the given URL.
    - Supports `view-source:` prefix or plain URLs.
    """
    if url.startswith("view-source:"):
        url = url.replace("view-source:", "").strip()
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    print(f"Fetching source for URL: {url}...")  # Debug log
    response = requests.get(url, timeout=30)  # Time out rather than hang on unresponsive hosts
    return response.text
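
# A minimal, hedged usage sketch (kept as a comment so no network request is made at
# import time; the URL is purely illustrative):
#
#   html_text = view_source_info("view-source:example.com")
#   print(html_text[:200])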

######################################
# 3) READERLM-2 FUNCTIONALITY        #
######################################
# Load the JinaAI ReaderLM-v2 model
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...")  # Debug log
html_converter = pipeline("text-generation", model=model_name)

def convert_html(html_input, output_format, custom_prompt=None):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
    - Takes raw HTML as input and converts it to the specified output format.
    - Allows for a custom system prompt.
    """
    if custom_prompt:
        prompt = f"{custom_prompt}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    print(f"Converting HTML to {output_format} with prompt: {custom_prompt or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input

    # Use the pipeline to generate the conversion
    response = html_converter(prompt, max_length=9999, num_return_sequences=1)
    converted_output = response[0]['generated_text']

    # Remove the prompt from the output to clean up the response
    converted_output = converted_output.replace(prompt, "").strip()
    print("Conversion completed.")  # Debug log
    return converted_output
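
# A minimal, hedged usage sketch (kept as a comment because it would invoke the
# ReaderLM-v2 pipeline; the HTML snippet is illustrative only):
#
#   md = convert_html("<h1>Hello</h1><p>World</p>", "Markdown")
#   print(md)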

######################################
# 4) READERLM-1 FUNCTIONALITY        #
######################################
# Prepare models and tokenizers for ReaderLM-1
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
models = {
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ).eval().to("cuda"),  # Load the smaller 0.5b model onto the GPU
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to("cuda"),  # Load the larger 1.5b model onto the GPU
}
tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ),
}

@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Generate Markdown from HTML using ReaderLM (0.5b or 1.5b) models.
    - Includes both model-based generation and a rule-based markdownify output.
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    model = models[model_id]  # Select the model based on the input ID
    tokenizer = tokenizers[model_id]  # Retrieve the corresponding tokenizer

    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)  # Format input text and append the assistant turn marker so the response can be extracted below
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text

    # Tokenize the input text
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate output using the model
    outputs = model.generate(
        inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
    )

    # Extract the assistant's response from the generated output
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
    print("Model generation completed.")  # Debug log

    # Use markdownify as a rule-based fallback for comparison
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log

    # Return both model-based and rule-based outputs
    return assistant_response[0], markdownify_output
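
# A minimal, hedged usage sketch (kept as a comment because it requires a GPU and the
# ReaderLM-1 weights loaded above; `example_html`, defined below, is a suitable input):
#
#   model_md, rule_based_md = run_example(example_html, "jinaai/reader-lm-0.5b")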

# Example HTML for ReaderLM-1
example_html = """<div id="myDIV" class="header">
  <h2>My To Do List</h2>
  <input type="text" id="myInput" placeholder="Title...">
  <span onclick="newElement()" class="addBtn">Add</span>
</div>

<ul id="myUL">
  <li>Hit the gym</li>
  <li class="checked">Pay bills</li>
  <li>Meet George</li>
  <li>Buy eggs</li>
  <li>Read a book</li>
  <li>Organize office</li>
</ul>"""

########################################################
# Combine everything into a single Gradio Blocks app   #
########################################################

# Optional extra CSS for styling the ReaderLM-1 tab
css = """
#output {
    height: 500px;           /* Set the height of the output box */
    overflow: auto;          /* Enable scrolling for large content */
    border: 1px solid #ccc;  /* Add a border around the box */
}
"""

# Initialize the Gradio app with the Nymbo/Nymbo_Theme for styling
print("Initializing Gradio app...")  # Debug log
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:

    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")  # Add a title for the tab

        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",  # Input for Markdown text
                )
            with gr.Column():
                md_output = gr.HTML(
                    label="Rendered Output"  # Display the rendered HTML output
                )

        # Update the output whenever the input changes
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

    ########################################################
    # TAB 2: View Source Info
    ########################################################
    with gr.Tab("View Source Info"):
        gr.Markdown("## View HTML Source Code")  # Tab description

        source_input = gr.Textbox(
            label="Enter URL (with or without 'view-source:')",
            placeholder="e.g., https://example.com or view-source:example.com"
        )
        source_output = gr.Textbox(
            label="HTML Source Code",
            lines=20
        )
        source_button = gr.Button("Fetch Source")  # Button to fetch source

        source_button.click(
            fn=view_source_info,
            inputs=source_input,
            outputs=source_output
        )

    ########################################################
    # TAB 3: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")  # Tab description

        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input"  # Input for raw HTML
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],  # Choose the output format
                label="Output Format",
                value="Markdown"  # Default to Markdown output
            )
            custom_prompt_2 = gr.Textbox(
                lines=2,
                placeholder="Optional: Enter a custom prompt...",
                label="Custom System Prompt"
            )

        convert_btn_2 = gr.Button("Convert")  # Button to trigger conversion
        converted_output_2 = gr.Textbox(
            lines=20,
            label="Converted Output"  # Display the converted output
        )

        # Provide usage details for the converter
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )

        # Connect the button click event to the conversion function
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2
        )

        # Add example inputs for demonstration
        gr.Examples(
            examples=[
                ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown", "Optional custom prompt"],
                ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON", "Optional custom prompt"]
            ],
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2,
            fn=convert_html,
            cache_examples=False  # Disable caching for dynamic examples
        )

    ########################################################
    # TAB 4: ReaderLM-1 HTML-to-Markdown
    ########################################################
    with gr.Tab("ReaderLM-1 Converter"):
        gr.Markdown("""
        # HTML-to-Markdown with ReaderLM-1
        Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b** 
        to convert HTML to Markdown. Compare against rule-based `markdownify`.
        """)

        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),  # Allow selection between the two models
                    label="Model",
                    value="jinaai/reader-lm-1.5b"  # Default to the larger model
                )
                html_content = gr.Textbox(
                    label="HTML"  # Input for raw HTML
                )
                submit_btn = gr.Button(value="Submit")  # Button to trigger the model

            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output", elem_id="output")  # Model-generated Markdown; styled by the #output CSS rule
                markdownify_output = gr.Textbox(label="Markdownify Output")  # Rule-based Markdown

        # Add example HTML input for demonstration
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            cache_examples=True,  # Cache example outputs
            label="Try example HTML"
        )

        # Connect the submit button to the run_example function
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output]
        )

# Finally, launch the combined demo app
print("Launching the demo...")  # Debug log
demo.launch()