File size: 2,658 Bytes
3943768
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import sys


def test_extract_xml_tags():
    """Check tag extraction on a small document and the filename derived from it.

    The fixture wraps ``name``, ``page`` and ``text`` elements in a ``doc``
    element; the test expects only ``name`` and ``page`` to come back from
    ``extract_xml_tags``, and then expects ``generate_unique_filename`` to
    turn that mapping into a ``(filename, clean_name, page)`` triple.
    """
    from openai_server.backend_utils import extract_xml_tags, generate_unique_filename

    # Same text as a triple-quoted block: starts and ends with a newline.
    sample_xml = (
        "\n"
        "<doc>\n"
        "<name>Zulu is hot..pdf</name>\n"
        "<page>1</page>\n"
        "<text>\n"
        "Zulu is hot.\n"
        "</text>\n"
        "</doc>\n"
    )

    tags = extract_xml_tags(sample_xml)
    assert tags == {'name': 'Zulu is hot..pdf', 'page': '1'}

    produced = generate_unique_filename(tags)
    filename, clean_name, page = produced
    assert (filename, clean_name, page) == ('Zulu_is_hot__page_1.txt', 'Zulu_is_hot_', '1')


def test_deduplicate_filenames():
    """Check that repeated filenames get ``_chunk_N`` suffixes and unique ones do not.

    Each row pairs an input filename with the name expected at the same
    position in the output of ``deduplicate_filenames``.
    """
    from openai_server.backend_utils import deduplicate_filenames

    # (input filename, expected output filename), in call order.
    pairs = [
        ("Zulu_is_hot__page_1.txt", "Zulu_is_hot__page_1_chunk_0.txt"),
        ("Zulu_is_hot__page_1.txt", "Zulu_is_hot__page_1_chunk_1.txt"),
        ("Zulu_is_hot__page_2.txt", "Zulu_is_hot__page_2.txt"),
        ("Another_document_page_1.txt", "Another_document_page_1.txt"),
        ("Zulu_is_hot__page_1.txt", "Zulu_is_hot__page_1_chunk_2.txt"),
    ]
    given = [source for source, _ in pairs]
    expected = [target for _, target in pairs]

    result = deduplicate_filenames(given)
    assert result == expected, f"Expected: {expected}, but got: {result}"


def test_generate_unique_filename_multiple_returns():
    """Run several metadata strings through tag extraction and filename generation.

    Inputs cover: both tags present, only a name, only a page, text with no
    tags, and the empty string. The per-input result triples are transposed
    into three parallel tuples (file names, cleaned names, pages) that the
    assertions below inspect by index.
    """
    from openai_server.backend_utils import extract_xml_tags
    from openai_server.backend_utils import generate_unique_filename

    meta_datas = [
        "<name>Zulu is hot..pdf</name>\n<page>1</page>",
        "<name>Missing page.pdf</name>",
        "<page>5</page>",
        "No XML tags here",
        "",
    ]

    triples = []
    for meta in meta_datas:
        tags = extract_xml_tags(meta)
        triples.append(generate_unique_filename(tags))

    # Transpose the list of triples into three tuples.
    file_names, cleaned_names, pages = zip(*triples)

    print("File names:", file_names)
    print("Cleaned names:", cleaned_names)
    print("Pages:", pages)

    # One output per input in every column.
    expected_count = len(meta_datas)
    for column in (file_names, cleaned_names, pages):
        assert len(column) == expected_count

    # Input 0: name and page both supplied.
    assert file_names[0] == "Zulu_is_hot__page_1.txt"
    assert cleaned_names[0] == "Zulu_is_hot_"
    assert pages[0] == "1"

    # Input 1: page tag absent, expected page is "0".
    assert file_names[1].endswith("_page_0.txt")
    assert cleaned_names[1] == "Missing_page"
    assert pages[1] == "0"

    # Input 2: only a page tag.
    assert pages[2] == "5"

    # Inputs 3 and 4: no tags at all.
    for index in (3, 4):
        assert file_names[index] == 'unknown_page_0.txt'


def test_exif():
    """Read EXIF data from the sample image and check that something came back.

    The image path is relative, so it is resolved against the current
    working directory. The metadata is echoed to stderr.
    """
    from pyexiv2 import Image

    with Image('tests/image_exif.jpg') as handle:
        exif_data = handle.read_exif()

    assert exif_data is not None
    assert exif_data != {}
    print(exif_data, file=sys.stderr)