File size: 9,250 Bytes
ccd18dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4707c1
ccd18dc
 
 
 
 
 
3e041fc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
import spaces
import gradio as gr
import json
import os
from pathlib import Path
import logging
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.document_converter import PdfFormatOption
import requests
from urllib.parse import urlparse
from datetime import datetime
import tempfile
from docx import Document
from docx.shared import Inches
import markdown

# Set up logging: configure the root logger at DEBUG level (so the
# logger.debug calls in this module are emitted) and create the
# module-level logger used throughout this file.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

def is_valid_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: Candidate URL, e.g. ``"https://example.com/doc.pdf"``.

    Returns:
        bool: ``True`` if ``urlparse`` yields a non-empty scheme and netloc,
        ``False`` otherwise (including when the value cannot be parsed).
    """
    try:
        parsed = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed input such as an unbalanced IPv6 bracket.
        # AttributeError / TypeError: non-string input (e.g. an int).
        # Deliberately narrower than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return False
    return bool(parsed.scheme and parsed.netloc)

def markdown_to_docx(markdown_content):
    """Convert markdown content to DOCX format.

    The conversion is line based: every input line becomes exactly one
    element of the document.

    * ``#`` .. ``######`` followed by a space -> heading of that level
    * ``* `` or ``- `` prefix -> paragraph with the ``List Bullet`` style
    * ``<digits>. `` prefix -> paragraph with the ``List Number`` style
    * any other non-blank line -> plain paragraph (inline markup is kept
      verbatim)
    * blank line -> empty paragraph

    Args:
        markdown_content: Markdown text to convert.

    Returns:
        The populated ``Document`` object; it is not saved by this function.
    """
    doc = Document()

    for line in markdown_content.split('\n'):
        # Headings: count the leading '#' characters; a space must follow
        # them, otherwise the line is ordinary text (e.g. "#hashtag").
        after_hashes = line.lstrip('#')
        level = len(line) - len(after_hashes)
        if 1 <= level <= 6 and after_hashes.startswith(' '):
            doc.add_heading(after_hashes[1:], level=level)
            continue

        # Bullet list items.
        if line.startswith(('* ', '- ')):
            doc.add_paragraph(line[2:], style='List Bullet')
            continue

        # Numbered list items: accept any item number ("2. ", "10. ", ...),
        # not only the literal "1. ".
        number, separator, item_text = line.partition('. ')
        if separator and number.isascii() and number.isdigit():
            doc.add_paragraph(item_text, style='List Number')
            continue

        if line.strip():
            # Normal text.
            doc.add_paragraph(line)
        else:
            # Blank line: keep the vertical spacing with an empty paragraph.
            doc.add_paragraph()

    return doc

def create_output_files(content, original_name):
    """Create temporary files for different formats and return their paths.

    The files are created with ``delete=False``, so they remain on disk
    after this function returns; removing them is left to the caller.

    Args:
        content: Markdown text of the converted document.
        original_name: Name of the source document; stored as ``"title"``
            in the JSON output.

    Returns:
        dict: File paths keyed by ``'markdown'``, ``'json'`` and ``'docx'``.
    """
    files = {}

    # Markdown: write through the temp-file handle itself so that no
    # descriptor is left open (the handle is closed when the block exits).
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix='.md', delete=False
    ) as md_file:
        md_file.write(content)
    files['markdown'] = md_file.name

    # JSON: the markdown text wrapped with a title and conversion timestamp.
    json_content = {
        "title": original_name,
        "content": content,
        "metadata": {
            "conversion_date": datetime.now().isoformat()
        }
    }
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix='.json', delete=False
    ) as json_file:
        json.dump(json_content, json_file, ensure_ascii=False, indent=2)
    files['json'] = json_file.name

    # DOCX: reserve a unique path, close the handle, then save to that path.
    with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as docx_file:
        docx_path = docx_file.name
    markdown_to_docx(content).save(docx_path)
    files['docx'] = docx_path

    return files

@spaces.GPU()
def process_document(input_type, file_input, url_input, use_gpu, table_mode):
    """Convert an uploaded file or a URL into Markdown, JSON and DOCX outputs.

    Args:
        input_type: ``"file"`` or ``"url"``; any other value produces an
            error status.
        file_input: Path of the uploaded document (read when ``input_type``
            is ``"file"``).
        url_input: URL of the document (read when ``input_type`` is
            ``"url"``).
        use_gpu: Accepted from the UI but not read by this function.
        table_mode: Truthy selects ``TableFormerMode.ACCURATE``, otherwise
            ``TableFormerMode.FAST``.

    Returns:
        tuple: ``(markdown_path, json_path, docx_path, markdown_text,
        status_message)``. When validation or conversion fails, the first
        four items are ``None`` and the last one carries the message.
    """
    # Placeholder for the four file/preview outputs on any failure path.
    no_outputs = (None, None, None, None)

    try:
        logger.debug(f"Processing with input type: {input_type}")
        logger.debug(f"File input: {file_input}")

        # Pipeline configuration: table structure recognition is always on,
        # only its mode depends on the checkbox.
        pdf_options = PdfPipelineOptions(do_table_structure=True)
        pdf_options.table_structure_options.mode = (
            TableFormerMode.ACCURATE if table_mode else TableFormerMode.FAST
        )
        pdf_format = PdfFormatOption(pipeline_options=pdf_options)
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

        # Resolve the conversion source and a display name for it.
        if input_type not in ("file", "url"):
            return (*no_outputs, "Invalid input type")

        if input_type == "file":
            if file_input is None:
                return (*no_outputs, "Please upload a file")
            source = file_input
            original_name = Path(file_input).name
        else:
            if not (url_input and is_valid_url(url_input)):
                return (*no_outputs, "Please enter a valid URL")
            source = url_input
            # Fall back to a fixed name when the URL path has no last segment.
            original_name = Path(urlparse(url_input).path).name or "url_document"

        logger.debug(f"Converting document: {source}")
        conversion = converter.convert(source)
        markdown_content = conversion.document.export_to_markdown()

        paths = create_output_files(markdown_content, original_name)
        success_message = "Conversion completed successfully! Use the download buttons below to get your files."
        return (
            paths['markdown'],
            paths['json'],
            paths['docx'],
            markdown_content,
            success_message,
        )

    except Exception as e:
        # UI boundary: log the traceback and surface the message as status.
        logger.exception("Error occurred during conversion")
        return (
            *no_outputs,
            f"Error during conversion: {str(e)}\nCheck the console for detailed error logs.",
        )

# Header markup rendered with gr.HTML at the top of the page: a centered
# title, tagline and "like this Space" note, followed by a <style> block
# with the CSS rules for the `.duplicate-button` class (default and hover
# appearance) that is assigned to the duplicate button.
title_html = """
<div style="text-align: center; max-width: 800px; margin: 0 auto;">
    <h1 style="color: #FFD700; font-size: 2.5rem; margin-bottom: 0.5rem;">Professional Document Converter</h1>
    <p style="color: #FFA500; font-size: 1.1rem; margin-bottom: 1.5rem;">Convert documents from files or URLs to various formats</p>
    <p style="color: #87CEEB; font-size: 0.9rem;">Please like this Space if you find it useful! Your support is appreciated 🙏</p>
</div>
<style>
.duplicate-button {
    margin: 0.5em auto 1em;
    display: block;
    background-color: #FFD700 !important;
    color: black !important;
    border: none !important;
    font-weight: bold !important;
}
.duplicate-button:hover {
    background-color: #FFA500 !important;
    transform: translateY(-2px);
    transition: all 0.2s ease;
}
</style>
"""

# Build the Gradio interface. The css argument is a rule that hides
# elements matching the `footer` selector; the custom footer added at the
# end of this block is a <div>, so that rule does not apply to it.
with gr.Blocks(css="footer {display: none}") as demo:
    gr.HTML(title_html)
    
    # Add duplicate button at the top (styled by the .duplicate-button CSS
    # defined in title_html)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button",
    )
    
    with gr.Row():
        # Left column: input selection and processing options
        with gr.Column(scale=1):
            # Values must match the "file" / "url" strings checked in
            # process_document
            input_type = gr.Radio(
                choices=["file", "url"],
                value="file",
                label="Input Type"
            )
            
            # File input restricted to PDF extensions; process_document
            # treats the received value as a path (it calls Path(...) on it)
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".PDF"],
                type="filepath"
            )
            
            # URL input
            url_input = gr.Textbox(
                label="Or Enter URL",
                placeholder="https://arxiv.org/pdf/2408.09869"
            )
            
            # Processing options
            # NOTE(review): use_gpu is passed to process_document, which does
            # not read it - the checkbox currently has no effect.
            use_gpu = gr.Checkbox(label="Use GPU", value=True)
            table_mode = gr.Checkbox(label="Use Accurate Table Mode (Slower but better)", value=False)
            
            convert_btn = gr.Button("Convert Document", variant="primary")
            
        # Right column: status, preview and downloads
        with gr.Column(scale=2):
            # Status message
            status_message = gr.Markdown("")
            
            # Preview area (receives the converted markdown text)
            preview = gr.Markdown("", label="Preview")
            
            # Download files
            with gr.Group() as download_group:
                gr.Markdown("### Download Files")
                with gr.Row():
                    markdown_output = gr.File(label="Download Markdown")
                    json_output = gr.File(label="Download JSON")
                    docx_output = gr.File(label="Download DOCX")

    # Define the main conversion event. The order of `outputs` must match
    # the 5-tuple returned by process_document:
    # (markdown path, json path, docx path, preview text, status message).
    convert_btn.click(
        fn=process_document,
        inputs=[input_type, file_input, url_input, use_gpu, table_mode],
        outputs=[markdown_output, json_output, docx_output, preview, status_message]
    )
    
    # Custom footer with author and project links
    footer = """
    <div style="text-align: center; margin: 2rem auto; padding: 1rem; border-top: 1px solid #FFD700; max-width: 800px;">
        <div style="margin-bottom: 1rem;">
            <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">LinkedIn</a> |
            <a href="https://github.com/arad1367" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">GitHub</a> |
            <a href="https://arad1367.pythonanywhere.com/" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">PhD Defense Demo</a> |
            <a href="https://github.com/DS4SD/docling" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">Docling Project</a>
        </div>
        <p style="color: #FFA500; margin-top: 0.5rem;">Made with 💖 by Pejman Ebrahimi</p>
    </div>
    """
    gr.HTML(footer)

# Launch the app (only when this file is run as a script)
if __name__ == "__main__":
    demo.queue(max_size=5)  # Enable queuing with a queue size limit of 5 (max_size is not a timeout)
    demo.launch(
        show_error=True,
        share=False,
        debug=True,
        show_api=False,
        server_name="0.0.0.0"  # listen on all network interfaces
    )