Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -17,9 +17,8 @@ def length_tokens(txt):
|
|
17 |
|
18 |
|
19 |
def extract_separators_from_string(separators_str):
|
20 |
-
print('Received:', type(separators_str), 'with value', repr(separators_str))
|
21 |
try:
|
22 |
-
separators_str = separators_str.replace("\\n", "\n").replace("\\t", "\t") # fix special characters
|
23 |
separators = separators_str[1:-1].split(", ")
|
24 |
return [separator.replace('"', "").replace("'", "") for separator in separators]
|
25 |
except Exception as e:
|
@@ -47,7 +46,6 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
|
|
47 |
separator=" ",
|
48 |
)
|
49 |
elif splitter_selection == LABEL_RECURSIVE:
|
50 |
-
print('Splitting with separators:', ',,'.join([repr(el) for el in separators]), f',and chunk length {length} and chunk overlap {chunk_overlap}')
|
51 |
text_splitter = RecursiveCharacterTextSplitter(
|
52 |
chunk_size=length,
|
53 |
chunk_overlap=int(chunk_overlap),
|
@@ -55,14 +53,9 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
|
|
55 |
strip_whitespace=False,
|
56 |
separators=separators,
|
57 |
)
|
58 |
-
print(text_splitter._separators)
|
59 |
splits = text_splitter.create_documents([text])
|
60 |
text_splits = [split.page_content for split in splits]
|
61 |
-
print('I did splits:')
|
62 |
-
print(text_splits)
|
63 |
-
|
64 |
unoverlapped_text_splits = unoverlap_list(text_splits)
|
65 |
-
|
66 |
output = [((split[0], 'Overlap') if split[1] else (split[0], f"Chunk {str(i)}")) for i, split in enumerate(unoverlapped_text_splits)]
|
67 |
return output
|
68 |
|
|
|
17 |
|
18 |
|
19 |
def extract_separators_from_string(separators_str):
|
|
|
20 |
try:
|
21 |
+
separators_str = separators_str.replace("\\n", "\n").replace("\\t", "\t").replace("\\\\", "\\") # fix special characters
|
22 |
separators = separators_str[1:-1].split(", ")
|
23 |
return [separator.replace('"', "").replace("'", "") for separator in separators]
|
24 |
except Exception as e:
|
|
|
46 |
separator=" ",
|
47 |
)
|
48 |
elif splitter_selection == LABEL_RECURSIVE:
|
|
|
49 |
text_splitter = RecursiveCharacterTextSplitter(
|
50 |
chunk_size=length,
|
51 |
chunk_overlap=int(chunk_overlap),
|
|
|
53 |
strip_whitespace=False,
|
54 |
separators=separators,
|
55 |
)
|
|
|
56 |
splits = text_splitter.create_documents([text])
|
57 |
text_splits = [split.page_content for split in splits]
|
|
|
|
|
|
|
58 |
unoverlapped_text_splits = unoverlap_list(text_splits)
|
|
|
59 |
output = [((split[0], 'Overlap') if split[1] else (split[0], f"Chunk {str(i)}")) for i, split in enumerate(unoverlapped_text_splits)]
|
60 |
return output
|
61 |
|