Advertisement
karim0209

vtt2text.py

Feb 16th, 2023 (edited)
956
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.57 KB | None | 0 0
"""
Convert YouTube subtitles (vtt) to human-readable text.

Download only subtitles from YouTube with youtube-dl:
youtube-dl  --skip-download --convert-subs vtt <video_url>

Note that default subtitle format provided by YouTube is ass, which is hard
to process with simple regex. Luckily youtube-dl can convert ass to vtt, which
is easier to process.

To convert all vtt files inside a directory:
find . -name "*.vtt" -exec python vtt2text.py {} \;
"""
  14.  
  15. import sys
  16. import re
  17.  
  18.  
  19. def remove_tags(text):
  20.     """
  21.    Remove vtt markup tags
  22.    """
  23.     tags = [
  24.         r'</c>',
  25.         r'<c(\.color\w+)?>',
  26.         r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
  27.  
  28.     ]
  29.  
  30.     for pat in tags:
  31.         text = re.sub(pat, '', text)
  32.  
  33.     # extract timestamp, only kep HH:MM
  34.     text = re.sub(
  35.         r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%',
  36.         r'\g<1>',
  37.         text
  38.     )
  39.  
  40.     text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
  41.     return text
  42.  
  43. def remove_header(lines):
  44.     """
  45.    Remove vtt file header
  46.    """
  47.     pos = -1
  48.     for mark in ('##', 'Language: en',):
  49.         if mark in lines:
  50.             pos = lines.index(mark)
  51.     lines = lines[pos+1:]
  52.     return lines
  53.  
  54.  
  55. def merge_duplicates(lines):
  56.     """
  57.    Remove duplicated subtitles. Duplacates are always adjacent.
  58.    """
  59.     last_timestamp = ''
  60.     last_cap = ''
  61.     for line in lines:
  62.         if line == "":
  63.             continue
  64.         if re.match('^\d{2}:\d{2}$', line):
  65.             if line != last_timestamp:
  66.                 yield line
  67.                 last_timestamp = line
  68.         else:
  69.             if line != last_cap:
  70.                 yield line
  71.                 last_cap = line
  72.  
  73.  
  74. def merge_short_lines(lines):
  75.     buffer = ''
  76.     for line in lines:
  77.         if line == "" or re.match('^\d{2}:\d{2}$', line):
  78.             yield '\n' + line
  79.             continue
  80.  
  81.         if len(line+buffer) < 80:
  82.             buffer += ' ' + line
  83.         else:
  84.             yield buffer.strip()
  85.             buffer = line
  86.     yield buffer
  87.  
  88.  
  89. def main():
  90.     vtt_file_name = sys.argv[1]
  91.     txt_name =  re.sub(r'.vtt$', '.txt', vtt_file_name)
  92.     with open(vtt_file_name) as f:
  93.         text = f.read()
  94.     text = remove_tags(text)
  95.     lines = text.splitlines()
  96.     lines = remove_header(lines)
  97.     lines = merge_duplicates(lines)
  98.     lines = list(lines)
  99.     lines = merge_short_lines(lines)
  100.     lines = list(lines)
  101.  
  102.     with open(txt_name, 'w') as f:
  103.         for line in lines:
  104.             f.write(line)
  105.             f.write("\n")
  106.  
  107.  
  108.  
  109. if __name__ == "__main__":
  110.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement