Spaces:
Running
Running
Joshua Lochner
committed on
Commit
·
0b48a99
1
Parent(s):
dbf7b4c
Improve exceptions thrown while obtaining transcripts
Browse files
- src/preprocess.py +7 -4
src/preprocess.py
CHANGED
@@ -81,6 +81,7 @@ PROFANITY_RAW = '[ __ ]' # How YouTube transcribes profanity
|
|
81 |
PROFANITY_CONVERTED = '*****' # Safer version for tokenizing
|
82 |
|
83 |
|
|
|
84 |
def get_auto_words(transcript_list):
|
85 |
words = []
|
86 |
transcript = transcript_list.find_generated_transcript(['en'])
|
@@ -139,13 +140,15 @@ def get_words(video_id, process=True, transcript_type='auto', fallback='manual',
|
|
139 |
else:
|
140 |
words = get_auto_words(transcript_list)
|
141 |
|
142 |
-
except (TooManyRequests, YouTubeRequestFailed
|
143 |
-
|
|
|
|
|
144 |
time.sleep(10) # Timeout
|
145 |
return get_words(video_id, process, transcript_type, fallback)
|
146 |
|
147 |
-
except CouldNotRetrieveTranscript:
|
148 |
-
pass
|
149 |
|
150 |
except json.decoder.JSONDecodeError:
|
151 |
print('JSONDecodeError for', video_id)
|
|
|
81 |
PROFANITY_CONVERTED = '*****' # Safer version for tokenizing
|
82 |
|
83 |
|
84 |
+
# TODO add end time for words
|
85 |
def get_auto_words(transcript_list):
|
86 |
words = []
|
87 |
transcript = transcript_list.find_generated_transcript(['en'])
|
|
|
140 |
else:
|
141 |
words = get_auto_words(transcript_list)
|
142 |
|
143 |
+
except (TooManyRequests, YouTubeRequestFailed):
|
144 |
+
raise # Cannot recover from these errors and do not mark as empty transcript
|
145 |
+
|
146 |
+
except requests.exceptions.ConnectionError: # Can recover
|
147 |
time.sleep(10) # Timeout
|
148 |
return get_words(video_id, process, transcript_type, fallback)
|
149 |
|
150 |
+
except CouldNotRetrieveTranscript: # Retrying won't solve
|
151 |
+
pass # Mark as empty transcript
|
152 |
|
153 |
except json.decoder.JSONDecodeError:
|
154 |
print('JSONDecodeError for', video_id)
|