Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import sys
- import uuid
- import string
- import re
- from urllib import urlopen
- from bs4 import BeautifulSoup
- import unicodedata
- import time
- import datetime
- import os
# Defaults used when the script is run without arguments.
random_wiki_article_url = "https://pl.wikipedia.org/wiki/Specjalna:Losowa_strona"
language_code = "pl"
# id of the <div> whose text is extracted from each downloaded page.
content_div_id = "mw-content-text"
# Number of pages fetched per run; also embedded in the output file name.
myrange = 100
# Per-language sub-directories are created below this directory.
output_base_dir = "/home/domi/Desktop/wikiscraps/"

# Compiled once: every ASCII punctuation character becomes a space.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
# Two or more consecutive spaces, collapsed to one at the end of cleaning.
_SPACE_RUN_RE = re.compile(' {2,}')


def clean_text(text):
    """Normalise raw article text into a single lower-case line.

    Steps, in order: newlines become spaces and carriage returns are
    dropped; the text is lower-cased; the word "wikipedia" is removed;
    hyphens become spaces; the " v t e " marker is removed; ASCII
    punctuation becomes spaces; digit characters are dropped; runs of
    spaces are collapsed to a single space.

    Leading and trailing spaces are not stripped.
    """
    text = text.replace('\n', ' ').replace('\r', '')
    text = text.lower()
    text = text.replace("wikipedia", "")
    text = text.replace("-", " ")
    text = text.replace(" v t e ", "")
    text = _PUNCTUATION_RE.sub(' ', text)
    text = ''.join(ch for ch in text if not ch.isdigit())
    # A single regex pass replaces the former `while ... replace` loop, which
    # could not terminate because it replaced a space with an identical space.
    return _SPACE_RUN_RE.sub(' ', text)


def fetch_article_text(url, div_id=content_div_id):
    """Download ``url`` and return the text inside the div with id ``div_id``.

    Returns None when the page contains no such div, so the caller can
    skip the page instead of failing on a missing element.
    """
    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    soup = BeautifulSoup(html, "html.parser")
    content = soup.find("div", {"id": div_id})
    if content is None:
        return None
    return content.getText()


def main(argv=None):
    """Fetch ``myrange`` pages and append one cleaned line per page to a file.

    ``argv`` defaults to ``sys.argv``. When it holds exactly two arguments
    they are taken as the language code and the page URL; with any other
    argument count the module-level defaults are used.
    """
    argv = sys.argv if argv is None else argv
    lang, url = language_code, random_wiki_article_url
    if len(argv) == 3:
        lang, url = argv[1], argv[2]

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    output_filename = "_".join(
        [lang, timestamp, str(uuid.uuid1()), str(myrange)]
    ) + ".txt"
    output_dir = os.path.join(output_base_dir, lang)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_path = os.path.join(output_dir, output_filename)

    # Create the (empty) output file up front so it exists even if the
    # very first download fails.
    open(output_path, 'w').close()

    for _ in range(myrange):
        raw_text = fetch_article_text(url, content_div_id)
        if raw_text is None:
            # Skipped pages mean the file may end up with fewer lines
            # than myrange.
            sys.stderr.write(
                "skipping page without div id: " + content_div_id + "\n"
            )
            continue

        text = clean_text(raw_text)
        utf8_text = text.encode('utf-8')
        print(len(text))
        print(utf8_text)

        # Re-opened per page so each line is flushed to disk immediately;
        # binary mode because the payload is already UTF-8 encoded bytes.
        with open(output_path, "ab") as fo:
            fo.write(utf8_text + b"\n")

    print("scrapping ended for language: " + lang)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement