import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension
import requests
from bs4 import BeautifulSoup

# For ReaderLM-2 functionality
from transformers import pipeline

# For ReaderLM-1 functionality
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify

######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY   #
######################################
def render_markdown(md_text):
    """
    Render a string of Markdown text into HTML using various Markdown extensions.
    - Supports tables, fenced code blocks, TOC, attribute lists, and syntax highlighting.
    """
    print("Rendering markdown input to HTML...")  # Debug log
    return markdown.markdown(
        md_text,
        extensions=[
            TableExtension(),  # Adds support for Markdown tables
            FencedCodeExtension(),  # Allows for fenced code blocks
            TocExtension(baselevel=2),  # Generates a Table of Contents starting at level 2
            AttrListExtension(),  # Enables attribute lists for elements
            CodeHiliteExtension(linenums=False, css_class="highlight"),  # Syntax highlighting for code blocks
        ],
    )
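
# A minimal, hedged usage sketch (the sample Markdown below is illustrative only and
# is not part of the app's flow):
#
#   html = render_markdown("# Title\n\n| a | b |\n|---|---|\n| 1 | 2 |")
#   # -> an HTML string such as '<h1 id="title">Title</h1>\n<table>...</table>'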

######################################
# 2) VIEW SOURCE INFO FUNCTIONALITY  #
######################################
def view_source_info(url):
    """
    Fetch the HTML source of the given URL.
    - Supports `view-source:` prefix or plain URLs.
    """
    if url.startswith("view-source:"):
        url = url.replace("view-source:", "").strip()
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    print(f"Fetching source for URL: {url}...")  # Debug log
    response = requests.get(url, timeout=30)  # Time out rather than hang on unresponsive hosts
    return response.text
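
# A minimal, hedged usage sketch (kept as a comment so no network request is made at
# import time; the URL is purely illustrative):
#
#   html_text = view_source_info("view-source:example.com")
#   print(html_text[:200])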

######################################
# 3) READERLM-2 FUNCTIONALITY        #
######################################
# Load the JinaAI ReaderLM-v2 model
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...")  # Debug log
html_converter = pipeline("text-generation", model=model_name)

def convert_html(html_input, output_format, custom_prompt=None):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
    - Takes raw HTML as input and converts it to the specified output format.
    - Allows for a custom system prompt.
    """
    if custom_prompt:
        prompt = f"{custom_prompt}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    print(f"Converting HTML to {output_format} with prompt: {custom_prompt or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input

    # Use the pipeline to generate the conversion
    response = html_converter(prompt, max_length=9999, num_return_sequences=1)
    converted_output = response[0]['generated_text']

    # Remove the prompt from the output to clean up the response
    converted_output = converted_output.replace(prompt, "").strip()
    print("Conversion completed.")  # Debug log
    return converted_output
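
# A minimal, hedged usage sketch (kept as a comment because it would invoke the
# ReaderLM-v2 pipeline; the HTML snippet is illustrative only):
#
#   md = convert_html("<h1>Hello</h1><p>World</p>", "Markdown")
#   print(md)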

######################################
# 4) READERLM-1 FUNCTIONALITY        #
######################################
# Prepare models and tokenizers for ReaderLM-1
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
models = {
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ).eval().to("cuda"),  # Load the smaller 0.5b model onto the GPU
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to("cuda"),  # Load the larger 1.5b model onto the GPU
}
tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ),
}

@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Generate Markdown from HTML using ReaderLM (0.5b or 1.5b) models.
    - Includes both model-based generation and a rule-based markdownify output.
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    model = models[model_id]  # Select the model based on the input ID
    tokenizer = tokenizers[model_id]  # Retrieve the corresponding tokenizer

    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)  # Format input text and append the assistant turn marker so the response can be extracted below
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text

    # Tokenize the input text
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate output using the model
    outputs = model.generate(
        inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
    )

    # Extract the assistant's response from the generated output
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
    print("Model generation completed.")  # Debug log

    # Use markdownify as a rule-based fallback for comparison
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log

    # Return both model-based and rule-based outputs
    return assistant_response[0], markdownify_output
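
# A minimal, hedged usage sketch (kept as a comment because it requires a GPU and the
# ReaderLM-1 weights loaded above; `example_html`, defined below, is a suitable input):
#
#   model_md, rule_based_md = run_example(example_html, "jinaai/reader-lm-0.5b")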

# Example HTML for ReaderLM-1
example_html = """<div id="myDIV" class="header">
  <h2>My To Do List</h2>
  <input type="text" id="myInput" placeholder="Title...">
  <span onclick="newElement()" class="addBtn">Add</span>
</div>

<ul id="myUL">
  <li>Hit the gym</li>
  <li class="checked">Pay bills</li>
  <li>Meet George</li>
  <li>Buy eggs</li>
  <li>Read a book</li>
  <li>Organize office</li>
</ul>"""

########################################################
# Combine everything into a single Gradio Blocks app   #
########################################################

# Optional extra CSS for styling the ReaderLM-1 tab
css = """
#output {
    height: 500px;           /* Set the height of the output box */
    overflow: auto;          /* Enable scrolling for large content */
    border: 1px solid #ccc;  /* Add a border around the box */
}
"""

# Initialize the Gradio app with the Nymbo/Nymbo_Theme for styling
print("Initializing Gradio app...")  # Debug log
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:

    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")  # Add a title for the tab

        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",  # Input for Markdown text
                )
            with gr.Column():
                md_output = gr.HTML(
                    label="Rendered Output"  # Display the rendered HTML output
                )

        # Update the output whenever the input changes
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

    ########################################################
    # TAB 2: View Source Info
    ########################################################
    with gr.Tab("View Source Info"):
        gr.Markdown("## View HTML Source Code")  # Tab description

        source_input = gr.Textbox(
            label="Enter URL (with or without 'view-source:')",
            placeholder="e.g., https://example.com or view-source:example.com"
        )
        source_output = gr.Textbox(
            label="HTML Source Code",
            lines=20
        )
        source_button = gr.Button("Fetch Source")  # Button to fetch source

        source_button.click(
            fn=view_source_info,
            inputs=source_input,
            outputs=source_output
        )

    ########################################################
    # TAB 3: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")  # Tab description

        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input"  # Input for raw HTML
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],  # Choose the output format
                label="Output Format",
                value="Markdown"  # Default to Markdown output
            )
            custom_prompt_2 = gr.Textbox(
                lines=2,
                placeholder="Optional: Enter a custom prompt...",
                label="Custom System Prompt"
            )

        convert_btn_2 = gr.Button("Convert")  # Button to trigger conversion
        converted_output_2 = gr.Textbox(
            lines=20,
            label="Converted Output"  # Display the converted output
        )

        # Provide usage details for the converter
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )

        # Connect the button click event to the conversion function
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2
        )

        # Add example inputs for demonstration
        gr.Examples(
            examples=[
                ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown", "Optional custom prompt"],
                ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON", "Optional custom prompt"]
            ],
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2,
            fn=convert_html,
            cache_examples=False  # Disable caching for dynamic examples
        )

    ########################################################
    # TAB 4: ReaderLM-1 HTML-to-Markdown
    ########################################################
    with gr.Tab("ReaderLM-1 Converter"):
        gr.Markdown("""
        # HTML-to-Markdown with ReaderLM-1
        Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b** 
        to convert HTML to Markdown. Compare against rule-based `markdownify`.
        """)

        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),  # Allow selection between the two models
                    label="Model",
                    value="jinaai/reader-lm-1.5b"  # Default to the larger model
                )
                html_content = gr.Textbox(
                    label="HTML"  # Input for raw HTML
                )
                submit_btn = gr.Button(value="Submit")  # Button to trigger the model

            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output", elem_id="output")  # Model-generated Markdown; styled by the #output CSS rule
                markdownify_output = gr.Textbox(label="Markdownify Output")  # Rule-based Markdown

        # Add example HTML input for demonstration
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            cache_examples=True,  # Cache example outputs
            label="Try example HTML"
        )

        # Connect the submit button to the run_example function
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output]
        )

# Finally, launch the combined demo app
print("Launching the demo...")  # Debug log
demo.launch()