Omarrran committed on
Commit
4ffe582
·
verified ·
1 Parent(s): 14f7424

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -2
app.py CHANGED
@@ -74,6 +74,13 @@ class TTSDatasetCollector:
74
  self.current_index = 0
75
  self.current_font = "english_serif"
76
  self.setup_directories()
 
 
 
 
 
 
 
77
  logger.info("TTS Dataset Collector initialized")
78
 
79
  def setup_directories(self) -> None:
@@ -122,8 +129,39 @@ class TTSDatasetCollector:
122
  if not text.strip():
123
  return False, "Text is empty"
124
 
125
- # Tokenize sentences
126
- self.sentences = nltk.sent_tokenize(text.strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  if not self.sentences:
128
  return False, "No valid sentences found in text"
129
 
 
74
  self.current_index = 0
75
  self.current_font = "english_serif"
76
  self.setup_directories()
77
+
78
+ # Ensure NLTK data is downloaded
79
+ try:
80
+ nltk.data.find('tokenizers/punkt')
81
+ except LookupError:
82
+ nltk.download('punkt', quiet=True)
83
+
84
  logger.info("TTS Dataset Collector initialized")
85
 
86
  def setup_directories(self) -> None:
 
129
  if not text.strip():
130
  return False, "Text is empty"
131
 
132
+ # Simple sentence splitting as fallback
133
+ def simple_split_sentences(text):
134
+ # Split on common sentence endings
135
+ sentences = []
136
+ current = []
137
+
138
+ for line in text.split('\n'):
139
+ line = line.strip()
140
+ if not line:
141
+ continue
142
+
143
+ # Split on common sentence endings
144
+ parts = line.replace('!', '.').replace('?', '.').split('.')
145
+ for part in parts:
146
+ part = part.strip()
147
+ if part:
148
+ current.append(part)
149
+ sentences.append(' '.join(current))
150
+ current = []
151
+
152
+ if current:
153
+ sentences.append(' '.join(current))
154
+
155
+ return [s.strip() for s in sentences if s.strip()]
156
+
157
+ try:
158
+ # Try NLTK first
159
+ self.sentences = nltk.sent_tokenize(text.strip())
160
+ except Exception as e:
161
+ logger.warning(f"NLTK tokenization failed, falling back to simple splitting: {str(e)}")
162
+ # Fallback to simple splitting
163
+ self.sentences = simple_split_sentences(text.strip())
164
+
165
  if not self.sentences:
166
  return False, "No valid sentences found in text"
167