File size: 3,781 Bytes
2cb5324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Trailing marker that render_ocr_text() removes from the raw text before
# rendering (presumably the model's end-of-turn token — TODO confirm).
stop_str = "<|im_end|>"

# Full-width comma and ideographic full stop mapped to their ASCII
# counterparts; render_ocr_text() applies the table to TikZ source.
punctuation_dict = dict(zip(",。", ",."))
translation_table = str.maketrans(punctuation_dict)


def svg_to_html(svg_content: str, output_filename: str) -> None:
    """Wrap SVG markup in a minimal HTML page and write it to disk.

    Args:
        svg_content: SVG markup inserted verbatim inside a fixed-size
            ``<svg width="2100" height="15000">`` container element.
        output_filename: Path of the HTML file to create or overwrite.
    """
    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>SVG Embedded in HTML</title>
    </head>
    <body>
        <svg width="2100" height="15000" xmlns="http://www.w3.org/2000/svg">
            {svg_content}
        </svg>
    </body>
    </html>
    """

    # The page declares charset UTF-8, so encode it that way explicitly
    # instead of relying on the platform's default text encoding.
    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(html_content)


# Directory (relative to the current working directory) that holds the HTML
# templates, and the placeholder marker those templates are split on.
_TEMPLATE_DIR = "./render_tools/"
_TEMPLATE_MARKER = "const text ="

# Sized LaTeX delimiters and the plain characters substituted for them when
# the \left / \right counts do not match.
_PLAIN_DELIMITERS = (
    ("\\left(", "("),
    ("\\right)", ")"),
    ("\\left[", "["),
    ("\\right]", "]"),
    ("\\left{", "{"),
    ("\\right}", "}"),
    ("\\left|", "|"),
    ("\\right|", "|"),
    ("\\left.", "."),
    ("\\right.", "."),
)


def _split_template(template_path):
    """Read an HTML template and return the text before and after the marker."""
    # NOTE(review): templates are assumed to be UTF-8 encoded — confirm.
    with open(template_path, "r", encoding="utf-8") as template:
        parts = template.read().split(_TEMPLATE_MARKER)
    # Only the first two pieces are used; a template without the marker
    # raises IndexError here.
    return parts[0], parts[1]


def _as_js_string_expr(text):
    """Convert text into a JavaScript expression of concatenated string literals."""
    # str.count matches substrings, so e.g. "\leftarrow" is counted as well.
    if text.count("\\right") != text.count("\\left"):
        for sized, plain in _PLAIN_DELIMITERS:
            text = text.replace(sized, plain)

    # A double quote would terminate the JS literal; dollar signs are dropped.
    text = text.replace('"', "``").replace("$", "")

    # One quoted literal per input line, each ending in an escaped newline.
    literals = [
        '"' + line.replace("\\", "\\\\") + '\\n"' for line in text.split("\n")
    ]
    return "+\n".join(literals)


def _tidy_tikz(text):
    """Normalise punctuation and line endings of TikZ source, one line at a time."""
    text = text.translate(translation_table)
    kept = []
    for line in text.split("\n"):
        if "\\begin{tikzpicture}" in line or "\\end{tikzpicture}" in line:
            # Environment delimiters are passed through untouched.
            kept.append(line)
            continue

        # rstrip also copes with whitespace-only lines, which previously
        # raised IndexError while trailing spaces were peeled off one by one.
        line = line.rstrip(" ")
        if not line:
            continue

        if not line.endswith(";"):
            # NOTE(review): the final character is *replaced* by ";" rather
            # than ";" being appended — presumably to repair a mis-recognised
            # terminator; confirm this is intended before changing it.
            line = line[:-1] + ";"
        kept.append(line)
    return "".join(line + "\n" for line in kept)


def render_ocr_text(text: str, result_path: str, format_text: bool = False) -> None:
    """Render OCR output text to an HTML file.

    A trailing ``stop_str`` and surrounding whitespace are removed first.
    What happens next depends on the content:

    * Text containing ``**kern`` is engraved to SVG with ``verovio`` and
      written through ``svg_to_html``; ``format_text`` is ignored.
    * Otherwise, when ``format_text`` is true, the text is spliced into an
      HTML template read from ``./render_tools/``: ``tikz.html`` when the text
      contains a ``tikzpicture`` environment, ``content-mmd-to-html.html``
      otherwise.
    * Otherwise nothing is written.

    Args:
        text: Raw OCR output.
        result_path: Path of the HTML file to create or overwrite.
        format_text: Whether non-``**kern`` text should be rendered at all.

    Raises:
        FileNotFoundError: If the required template file does not exist.
        IndexError: If the template does not contain ``const text =``.
    """
    if text.endswith(stop_str):
        text = text[: -len(stop_str)]
    text = text.strip()

    if "**kern" in text:
        # Imported lazily so verovio is only needed when a score is rendered.
        import verovio

        tk = verovio.toolkit()
        tk.loadData(text)
        tk.setOptions(
            {
                "pageWidth": 2100,
                "footer": "none",
                "barLineWidth": 0.5,
                "beamMaxSlope": 15,
                "staffLineWidth": 0.2,
                "spacingStaff": 6,
            }
        )
        # Return value is unused; the call is kept as-is (presumably it
        # triggers layout before rendering — TODO confirm).
        tk.getPageCount()
        svg = tk.renderToSVG()
        svg = svg.replace('overflow="inherit"', 'overflow="visible"')

        svg_to_html(svg, result_path)
        return

    if not format_text:
        return

    if "\\begin{tikzpicture}" in text:
        tikz_source = _tidy_tikz(text)
        head, tail = _split_template(_TEMPLATE_DIR + "/tikz.html")
        # The TikZ template has the marker itself replaced by the source.
        page = head + tikz_source + tail
    else:
        js_expr = _as_js_string_expr(text)
        head, tail = _split_template(_TEMPLATE_DIR + "/content-mmd-to-html.html")
        page = head + _TEMPLATE_MARKER + js_expr + tail

    # OCR text may contain non-ASCII characters; do not depend on the locale.
    with open(result_path, "w", encoding="utf-8") as out_file:
        out_file.write(page)