# scrape_stackoverflow.py: rewritten for Python 3, but the output is still somewhat faulty
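#
# Walks the highest-voted pages of https://stackoverflow.com/questions/tagged/python,
# pulls the answer cells from each question page, strips and normalises the HTML,
# and writes each question's answers to its own .txt file in a 'stackoverflow/' folder.
# Progress is tracked in checked.txt (questions already saved) and pagemark.txt (last page).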
import re
import os
import time
import urllib.request
- web = "https://stackoverflow.com"
- url = web+"/questions/tagged/python?{}sort=votes&pagesize=50"
- ttt = "stackoverflow/"
- targetA = '<h3><a href="/questions/.+?python.+?href="/questions/tagged/'
- targetB = '<h3><a href="/questions/(.+?)"' # brackets capture string
- ans = '<div class="answercell.+?<p>(.+?)</div>'
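# For illustration (hypothetical slug): applied to a line such as
#   <h3><a href="/questions/12345/some-question-title" ...>
# targetB captures '12345/some-question-title', which is later appended to web + '/questions/'.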
r10 = [10 - z for z in range(10)]         # 10, 9, ..., 1: used to collapse runs of 3*i spaces below
# escaped entity -> replacement, split on the space; '&lt;/' maps to '<' so escaped
# closing tags become opening-tag strings and are stripped by the lists below
misc = '''
&lt;/ <
&gt; >
&lt; <
'''.splitlines()
br = '''
<a>
<p>
<blockquote>
'''.splitlines()

no = '''
<code>
<li>
<ul>
<h1>
<em>
<pre>
<strong>
'''.splitlines()
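# How the three lists are used in the cleanup pass in the main loop below: misc de-escapes
# HTML entities, br turns block-level tags into blank lines, and no drops inline/formatting
# tags outright.  splitlines() on a triple-quoted string leaves empty entries, which the
# 'if zzz:' checks skip.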
from tkinter import Tk, filedialog

# ask the user for any .txt file near the output folder; only its directory is kept
root = Tk()
file = filedialog.askopenfilename(filetypes=[("txt files", "*.txt"), ("all files", "*.*")])
file = file[:file.rfind('/') + 1]
if ttt not in file: file += ttt
logs = os.path.join(file, 'checked.txt')
pagemark = os.path.join(file, 'pagemark.txt')
pg = ''
if os.path.isfile(pagemark):              # no pagemark exists yet on a fresh run
    f = open(pagemark, 'r')
    pg = f.read()
    f.close()
root.destroy()
print()
p = input('Either Just Press Return To Continue From Last Page Or...\nStart From Which Page? ')
if not p:
    if pg:
        print([pg])
        p = pg.split('pg: ')[1]           # pagemark line ends with '... pg: <number>'
    else:
        p = 1
p = int(p)
print([p])
def write_data(f, data=''):
    # append to log files, but overwrite the pagemark; add a newline after non-empty data
    NL = ''
    mode = 'a'
    if f == pagemark:
        mode = 'w'
    if data: NL = '\n'
    f = open(f, mode)
    f.write(data + NL)
    f.close()
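# e.g. write_data(logs, '12345/some-question-title') appends one line to checked.txt,
# while write_data(pagemark, log) rewrites pagemark.txt with the latest progress line.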
#
# create the output folder and the checked-questions log on the first run
if not os.path.exists(file):
    os.mkdir(file)
if not os.path.isfile(logs):
    write_data(logs, '')
f = open(logs)
checked = f.read().splitlines()
f.close()
print('\n'.join(checked))
def pg(p):
    # build the tag-page URL; page 1 takes no page parameter
    if p > 1:
        return url.format("page={}&".format(p))
    return url.format('')
#
L = 1
prev = ''
def scrape(z=0):
    # fetch a URL and return the decoded page text, retrying with a growing delay on failure
    n = 10
    while 1:
        try:
            return urllib.request.urlopen(z).read().decode('utf-8', 'replace')
        except Exception:
            print('*** Unable to access data, retrying in', n, 'seconds...')
            time.sleep(n)
            n = min(n + 10, 60 * 12)
#
while 1:
    links = []
    L = scrape(pg(p))                     # listing page for the current page number
    L = L.replace('\n', '')
    L = re.findall(targetA, L)            # question summary blocks
    L = '\n'.join(L)
    L = re.findall(targetB, L)            # question paths like '12345/some-question-title'
    if not L or prev == L[-1]:            # stop when nothing matched or the page repeats
        break
    print('\n\n'.join(L))
    print()
    prev = L[-1]
    links.extend(L)
    for z in links:
        if z not in checked:
            print()
            L = scrape(web + '/questions/' + z)
            L = L.replace('\n', '')
            L = re.findall(ans, L)
            L = '\n\n###@###\n'.join(L)   # marker between individual answers
            if len(L) > 100:
                # collapse long runs of spaces (30 down to 3) into newline + tab
                for i in r10:
                    tabs = ' ' * (3 * i)
                    L = L.replace(tabs, '\n' + '\t')
                # de-escape HTML entities; empty list entries raise ValueError and are skipped
                for zzz in misc:
                    try:
                        a, b = zzz.split(' ')
                        L = L.replace(a, b)
                    except ValueError:
                        pass
                # block-level tags become blank lines, inline tags are dropped
                for zzz in br:
                    if zzz: L = L.replace(zzz, '\n\n')
                for zzz in no:
                    if zzz: L = L.replace(zzz, '')
                # a few passes to settle the leftover whitespace
                for _ in range(10):
                    L = L.replace('\n\n\n', '\n\n')
                    L = L.replace('\t\n', '\n\n')
                    L = L.replace(' \n', '\n\n')
                    L = L.replace('\n.', '\n')
                    L = L.replace('\t ', '\t')
                    L = L.replace(' \t', '\t')
            f = z.replace('/', '-')
            new_file = os.path.join(file, f + '.txt')
            write_data(new_file, L)
            checked.append(z)
            write_data(logs, z)
            print(L)
            if z == links[-1]:
                log = '{} files ***$$$$$*** pg: {}'.format(len(checked), p)
                write_data(pagemark, log)
                print(log)
        else:
            print('.', end='')
    print()
    print(p)
    p += 1
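# Usage sketch (assumes Python 3 with tkinter available and a writable working directory):
#   python scrape_stackoverflow.py
# In the file dialog, pick any .txt file inside (or next to) the stackoverflow/ folder;
# at the prompt, press Return to resume from the page stored in pagemark.txt, or type a page number.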