Advertisement
here2share

# scrape_stackoverflow.py

Nov 16th, 2020
1,440
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.96 KB | None | 0 0
  1. # scrape_stackoverflow.py ZZZ rewritten for py3, but now has somewhat a faulty output
  2.  
  3. import re    
  4. from urllib.parse import urlparse    
  5. import urllib.request
  6.  
  7. import re
  8. import os
  9. import string
  10. import urllib
  11. import time
  12.  
  13. web = "https://stackoverflow.com"
  14. url = web+"/questions/tagged/python?{}sort=votes&pagesize=50"
  15. ttt = "stackoverflow/"
  16.  
  17. targetA = '<h3><a href="/questions/.+?python.+?href="/questions/tagged/'
  18. targetB = '<h3><a href="/questions/(.+?)"' # brackets capture string
  19.  
  20. ans = '<div class="answercell.+?<p>(.+?)</div>'
  21.  
  22. r10 = [10-z for z in range(10)]
  23.  
  24. misc='''
  25. </ <
  26. &gt; >
  27. &lt; <
  28. '''.splitlines()
  29. br='''
  30. <a>
  31. <p>
  32. <blockquote>
  33. '''.splitlines()
  34. no='''
  35. <code>
  36. <li>
  37. <ul>
  38. <h1>
  39. <em>
  40. <pre>
  41. <strong>
  42. '''.splitlines()
  43.  
  44. from tkinter import filedialog
  45. from tkinter import *
  46.  
  47. root = Tk()
  48. file = filedialog.askopenfilename(filetypes=[("txt files","*.txt"),("all files","*.*")])
  49. file = file[:file.rfind('/')+1]
  50. if ttt not in file: file+=ttt
  51. logs = os.path.join(file,'checked.txt')
  52. pagemark = os.path.join(file,'pagemark.txt')
  53. f = open(pagemark, 'r')
  54. pg = f.read()
  55. f.close()
  56.  
  57. root.destroy()
  58.  
  59. print
  60. p = input('Either Just Press Return To Continue From Last Page Or...\nStart From Which Page?  ')
  61. if not p:
  62.     if pg:
  63.         print([pg])
  64.         p = pg.split('pg: ')[1]
  65.     else:
  66.         p = 1
  67. p = int(p)
  68. print([p])
  69.  
  70. def write_data(f, data=''):
  71.     NL = ''
  72.     mode = 'a'
  73.     if f == pagemark:
  74.         mode = 'w'
  75.     if data: NL='\n'
  76.     f = open(f, mode)
  77.     f.write(data+NL)
  78.     f.close()
  79. #
  80. if not os.path.exists(file):
  81.     os.mkdir(file)
  82. if not os.path.isfile(logs):
  83.     write_data(logs, '')
  84.  
  85. f = open(logs)
  86. checked = f.read().splitlines()
  87. f.close()
  88. print('\n'.join(checked))
  89.  
  90. def pg(p):
  91.     if p > 1:
  92.         return url.format("page={}&".format(p))
  93.     return url.format('')
  94.        
  95. #
  96. L = 1
  97. prev = ''
  98.  
  99. def scrape(z=0):
  100.     n = 10
  101.     while 1:
  102.         try:
  103.             return urllib.request.urlopen(z).read()
  104.         except:
  105.             print('*** Unable to access data, retrying in',n,'seconds...')
  106.             t = time.time()+n
  107.             while t > time.time(): 0
  108.             n = min(n+10,60*12)
  109. #
  110.  
  111. while 1:
  112.     links = []
  113.     L = str(scrape(pg(p)))
  114.     L = L.replace('\n','')
  115.     L = re.findall(targetA, L)
  116.     L = '\n'.join(L)
  117.     L = re.findall(targetB, L)
  118.     if prev == L[-1]:
  119.         break
  120.     print('\n\n'.join(L))
  121.     print
  122.     prev = L[-1]
  123.     links.extend(L)
  124.     for z in links:
  125.         if z not in checked:
  126.             print
  127.             L = str(scrape(web+'/questions/'+z))
  128.             L = L.replace('\n','')
  129.             L = re.findall(ans, L)
  130.             L = '\n\n###@###\n'.join(L)
  131.             if len(L) > 100:
  132.                 for i in r10:
  133.                     tabs = ' '*(3*i)
  134.                     L = L.replace(tabs,'\n'+'\t')
  135.                 for zzz in misc:
  136.                         try:
  137.                             a,b = zzz.split(' ')
  138.                             L = L.replace(a,b)
  139.                         except:
  140.                             0
  141.                 for zzz in br:
  142.                     if zzz: L = L.replace(zzz,'\n\n')
  143.                 for zzz in no:
  144.                     if zzz: L = L.replace(zzz,'')
  145.                 for t in 'z'*10:
  146.                     L = L.replace('\n\n\n','\n\n')
  147.                     L = L.replace('\t\n','\n\n')
  148.                     L = L.replace(' \n','\n\n')
  149.                     L = L.replace('\n.','\n')
  150.                     L = L.replace('\t ','\t')
  151.                     L = L.replace(' \t','\t')
  152.                 f = z.replace('/','-')
  153.                 new_file=os.path.join(file,f+'.txt')
  154.                 write_data(new_file, L)
  155.                 checked.append(z)
  156.                 write_data(logs, z)
  157.                 print(L)
  158.                 if z == links[-1]:
  159.                     log = '{} files ***$$$$$*** pg: {}'.format(len(checked), p)
  160.                     write_data(pagemark, log)
  161.                     print(log)
  162.         else:
  163.             print('.',end='')
  164.                
  165.     print
  166.     print(p)
  167.     p+=1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement