Advertisement
desdemona

Polish wiki scraper

May 24th, 2016
537
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.45 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import sys
  4. import uuid
  5. import string
  6. import re
  7.  
  8. from urllib import urlopen
  9. from bs4 import BeautifulSoup
  10. import unicodedata
  11. import time
  12. import datetime
  13.  
  14.  
  15. random_wiki_article_url = "https://pl.wikipedia.org/wiki/Specjalna:Losowa_strona"
  16. language_code = "pl"
  17.  
  18. content_div_id = "mw-content-text"
  19.  
  20. timestamp = str(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S'))
  21. output_filename = language_code + "_" + timestamp + "_" + str(uuid.uuid1()) + ".txt"
  22. output_dir = "/home/domi/Desktop/wikiscraps/pl/"
  23. open(output_dir + output_filename, 'w').close()
  24.  
  25.  
  26.  
  27. for i in range(0, 1000):
  28.     fo = open(output_dir + output_filename, "a")
  29.  
  30.     html = urlopen(random_wiki_article_url).read()
  31.  
  32.     soup = BeautifulSoup(html, "html.parser")
  33.     content = soup.find("div", {"id": content_div_id})
  34.     text = content.getText()
  35.  
  36.     #obrobka tekstu
  37.     text = text.replace('\n', ' ').replace('\r', '')
  38.     text = text.lower()
  39.     text = text.replace("wikipedia", "")
  40.     text = text.replace("-", " ")
  41.     text = text.replace(" v t e ", "")
  42.     regex = re.compile('[%s]' % re.escape(string.punctuation))
  43.     text = regex.sub(' ', text)
  44.     text = ''.join(j for j in text if not j.isdigit())
  45.     while "  " in text:
  46.         text = text.replace("  ", " ")
  47.     print(text)
  48.     print(len(text))
  49.     utf8_text = text.encode('utf-8')
  50.     fo.write(utf8_text + "\n")
  51.     fo.close()
  52.  
  53. print("do widzenia dzieciaczki")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement