Joshua Lochner committed on
Commit
0b48a99
·
1 Parent(s): dbf7b4c

Improve exceptions thrown while obtaining transcripts

Browse files
Files changed (1) hide show
  1. src/preprocess.py +7 -4
src/preprocess.py CHANGED
@@ -81,6 +81,7 @@ PROFANITY_RAW = '[ __ ]' # How YouTube transcribes profanity
81
  PROFANITY_CONVERTED = '*****' # Safer version for tokenizing
82
 
83
 
 
84
  def get_auto_words(transcript_list):
85
  words = []
86
  transcript = transcript_list.find_generated_transcript(['en'])
@@ -139,13 +140,15 @@ def get_words(video_id, process=True, transcript_type='auto', fallback='manual',
139
  else:
140
  words = get_auto_words(transcript_list)
141
 
142
- except (TooManyRequests, YouTubeRequestFailed, requests.exceptions.ConnectionError) as e: # Can retry
143
- print(e)
 
 
144
  time.sleep(10) # Timeout
145
  return get_words(video_id, process, transcript_type, fallback)
146
 
147
- except CouldNotRetrieveTranscript:
148
- pass
149
 
150
  except json.decoder.JSONDecodeError:
151
  print('JSONDecodeError for', video_id)
 
81
  PROFANITY_CONVERTED = '*****' # Safer version for tokenizing
82
 
83
 
84
+ # TODO add end time for words
85
  def get_auto_words(transcript_list):
86
  words = []
87
  transcript = transcript_list.find_generated_transcript(['en'])
 
140
  else:
141
  words = get_auto_words(transcript_list)
142
 
143
+ except (TooManyRequests, YouTubeRequestFailed):
144
+ raise # Cannot recover from these errors and do not mark as empty transcript
145
+
146
+ except requests.exceptions.ConnectionError: # Can recover
147
  time.sleep(10) # Timeout
148
  return get_words(video_id, process, transcript_type, fallback)
149
 
150
+ except CouldNotRetrieveTranscript: # Retrying won't solve
151
+ pass # Mark as empty transcript
152
 
153
  except json.decoder.JSONDecodeError:
154
  print('JSONDecodeError for', video_id)