Update app.py
app.py CHANGED
@@ -74,6 +74,13 @@ class TTSDatasetCollector:
         self.current_index = 0
         self.current_font = "english_serif"
         self.setup_directories()
+
+        # Ensure NLTK data is downloaded
+        try:
+            nltk.data.find('tokenizers/punkt')
+        except LookupError:
+            nltk.download('punkt', quiet=True)
+
         logger.info("TTS Dataset Collector initialized")
 
     def setup_directories(self) -> None:
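Note on the download guard above: recent NLTK releases (3.9 and later) ship the Punkt tokenizer data as punkt_tab rather than punkt, so sent_tokenize can still raise LookupError in a fresh environment even after this check passes. A minimal sketch of a guard that probes both resource names follows; whether punkt_tab is actually needed depends on the NLTK version the Space pins.

import nltk

# Probe both the classic and the repackaged Punkt resources and download
# whichever is missing; nltk.download() skips packages already present.
for resource, package in [("tokenizers/punkt", "punkt"),
                          ("tokenizers/punkt_tab", "punkt_tab")]:
    try:
        nltk.data.find(resource)
    except LookupError:
        nltk.download(package, quiet=True)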
@@ -122,8 +129,39 @@ class TTSDatasetCollector:
         if not text.strip():
             return False, "Text is empty"
 
-        # Split text into sentences
-        self.sentences = nltk.sent_tokenize(text.strip())
+        # Simple sentence splitting as fallback
+        def simple_split_sentences(text):
+            # Split on common sentence endings
+            sentences = []
+            current = []
+
+            for line in text.split('\n'):
+                line = line.strip()
+                if not line:
+                    continue
+
+                # Split on common sentence endings
+                parts = line.replace('!', '.').replace('?', '.').split('.')
+                for part in parts:
+                    part = part.strip()
+                    if part:
+                        current.append(part)
+                        sentences.append(' '.join(current))
+                        current = []
+
+            if current:
+                sentences.append(' '.join(current))
+
+            return [s.strip() for s in sentences if s.strip()]
+
+        try:
+            # Try NLTK first
+            self.sentences = nltk.sent_tokenize(text.strip())
+        except Exception as e:
+            logger.warning(f"NLTK tokenization failed, falling back to simple splitting: {str(e)}")
+            # Fallback to simple splitting
+            self.sentences = simple_split_sentences(text.strip())
+
         if not self.sentences:
             return False, "No valid sentences found in text"
 
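For reference, here is the fallback splitter from the hunk above as a standalone function, with a short demo of its behavior. Two properties worth knowing: terminal punctuation is discarded, and every '.', '!' or '?' counts as a boundary, so abbreviations and decimals are split as well. As reconstructed here, each non-empty fragment is appended and flushed in the same iteration, so the current buffer is always empty by the end and the trailing if current: branch is effectively dead code.

def simple_split_sentences(text):
    # Same body as in the hunk above.
    sentences = []
    current = []

    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue

        # Split on common sentence endings
        parts = line.replace('!', '.').replace('?', '.').split('.')
        for part in parts:
            part = part.strip()
            if part:
                current.append(part)
                sentences.append(' '.join(current))
                current = []

    if current:
        sentences.append(' '.join(current))

    return [s.strip() for s in sentences if s.strip()]

print(simple_split_sentences("Hello world! How are you?\nFine."))
# ['Hello world', 'How are you', 'Fine']

print(simple_split_sentences("Dr. Smith measured 3.14 cm."))
# ['Dr', 'Smith measured 3', '14 cm']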
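The broad except Exception around sent_tokenize engages the fallback for any failure, most commonly LookupError when the Punkt data is missing in a fresh container. A hedged sketch of forcing that branch in a test without removing data, using unittest.mock and the standalone simple_split_sentences above:

import nltk
from unittest import mock

# Stub sent_tokenize so it raises, mimicking an environment without
# Punkt data; the except branch then takes the simple splitter.
with mock.patch.object(nltk, "sent_tokenize", side_effect=LookupError("punkt")):
    text = "One. Two."
    try:
        sentences = nltk.sent_tokenize(text)
    except Exception:
        sentences = simple_split_sentences(text)  # fallback path

print(sentences)  # ['One', 'Two']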