Advertisement
arfin97

Youtube Transcript Cleaner.py

Nov 13th, 2018
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.78 KB | None | 0 0
  1. import re
  2. import sys
  3. # Read and Write in python
  4. regex_pattern1 = r'<p.*>'   # Do not delete 'r'.
  5. regex_pattern2 = r'</p>'
  6. regex_pattern3 = r'\n+'
  7. regex_pattern4 = r'<body>|</body>|<timedtext.*>|</timedtext>'
  8. regex_pattern5 = r'^This XML.*.'
  9. regex_pattern6 = r'\n'
  10. regex_pattern7 = r'\.'
  11.  
  12. with open('text.txt', 'r') as rf:
  13.     file_contents = rf.read();
  14.     # print(file_contents)
  15.     match = re.sub(regex_pattern1, "", file_contents)
  16.     match = re.sub(regex_pattern2, "", match)
  17.     match = re.sub(regex_pattern4, "", match)
  18.     match = re.sub(regex_pattern5, "", match)
  19.     match = re.sub(regex_pattern3, "\n\n", match)
  20.     # match = re.sub(regex_pattern6, "", match)
  21.     # match = re.sub(regex_pattern7, ".\n", match)
  22.  
  23.     print(match, end='')
  24.  
  25.     with open('text2.txt', 'w') as wf:
  26.         wf.write(match)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement