Advertisement
here2share

# scrape_cpp_stackoverflow.py

Jul 8th, 2021
1,208
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.55 KB | None | 0 0
  1. # scrape_cpp_stackoverflow.py ZZZ rewritten for py3, but now produces somewhat faulty output
  2.  
  3. # to gather C++ content
  4.  
  5. import re
  6. import os
  7. from urllib.request import urlopen
  8. import time
  9.  
  10. # import tkFileDialog
  11. # from Tkinter import *
  12.  
  13. from tkinter import filedialog
  14. from tkinter import *
  15.  
  16. misc='''
  17. </ <
  18. &gt; >
  19. &lt; <
  20. '''.splitlines()
  21. br='''
  22. <a>
  23. <p>
  24. <blockquote>
  25. '''.splitlines()
  26. no='''
  27. <code>
  28. <li>
  29. <ul>
  30. <h1>
  31. <em>
  32. <pre>
  33. <strong>
  34. '''.splitlines()
  35.  
  36. def write_data(f, data=''):
  37.     NL = ''
  38.     mode = 'a'
  39.     if f == pagemark:
  40.         mode = 'w'
  41.     if data: NL='\n'
  42.     f = open(f, mode)
  43.     f.write(data+NL)
  44.     f.close()
  45. #
  46.  
  47. web = "https://stackoverflow.com"
  48. url = web+"/questions/tagged/cpp?{}sort=votes&pagesize=50"
  49. ttt = "stackoverflow/"
  50.  
  51. targetA = 'questions/'
  52. targetB = '" class=' # brackets capture string
  53.  
  54. ans = '<div class="answercell.+?<p>(.+?)</div>'
  55.  
  56. r10 = [10-z for z in range(10)]
  57.  
  58. root = Tk()
  59. file = filedialog.askopenfilename(filetypes=[("txt files","*.txt"),("all files","*.*")])
  60. file = file[:file.rfind('/')+1]
  61. if ttt not in file: file+=ttt
  62.  
  63. logs = os.path.join(file,'cpp_checked.txt')
  64. pagemark = os.path.join(file,'cpp_pagemark.txt')
  65.  
  66. if not os.path.exists(file):
  67.     os.mkdir(file)
  68. if not os.path.isfile(logs):
  69.     write_data(logs, '')
  70. if not os.path.isfile(pagemark):
  71.     write_data(pagemark, '')
  72. #
  73.  
  74. root.destroy()
  75.  
  76. misc='''
  77. </ <
  78. &gt; >
  79. &lt; <
  80. '''.splitlines()
  81. br='''
  82. <a>
  83. <p>
  84. <blockquote>
  85. '''.splitlines()
  86. no='''
  87. <code>
  88. <li>
  89. <ul>
  90. <h1>
  91. <pre>
  92. <strong>
  93. '''.splitlines()
  94.  
  95. f = open(logs)
  96. checked = f.read().splitlines()
  97. f.close()
  98. f = open(pagemark)
  99. mark = f.read().splitlines()
  100. f.close()
  101. print(mark)
  102. p = input('Start From Which Page? ')
  103. if not p: p = 1
  104. p = int(p)
  105.  
  106. def pg(p):
  107.     if p > 1:
  108.         return url.format("page={}&".format(p))
  109.     return url.format('')
  110.  
  111. #
  112. prev = ''
  113.  
  114. def scrape(z=0):
  115.     c = 5
  116.     while 1:
  117.         try:
  118.             return str(urlopen(z).read())
  119.         except:
  120.             print('*** Unable to access data, retrying...')
  121.             print(c)
  122.             t = time.time()+c
  123.             while t > time.time(): 0
  124.         c += 5
  125. #
  126.  
  127. while 1:
  128.     links = []
  129.     L = scrape(pg(p))
  130.     L = L.replace('questions/tagged/','')
  131.     L = L.replace(targetA,'@+++@<!!!>')
  132.     L = L.replace(targetB,'@+++@')
  133.     L = L.split('@+++@')
  134.     L = [s[5:] for s in L if '<!!!>' in s][1:]
  135.     if prev == L[-1]:
  136.         break
  137.     print('\n\n'.join(L))
  138.     print()
  139.     prev = L[-1]
  140.     links.extend(L)
  141.     for z in links:
  142.         if z not in checked:
  143.             print()
  144.             L = scrape(web+'/questions/'+z)
  145.             L = L.replace('\n','')
  146.             L = re.findall(ans, L)
  147.             L = '\n\n###@###\n'.join(L)
  148.             if len(L) > 100:
  149.                 for i in r10:
  150.                     tabs = ' '*(3*i)
  151.                     L = L.replace(tabs,'\n'+'\t')
  152.                 for zzz in misc:
  153.                         try:
  154.                             a,b = zzz.split(' ')
  155.                             L = L.replace(a,b)
  156.                         except:
  157.                             0
  158.                 for zzz in br:
  159.                     if zzz: L = L.replace(zzz,'\n\n')
  160.                 for zzz in no:
  161.                     if zzz: L = L.replace(zzz,'')
  162.                 for t in 'z'*10:
  163.                     L = L.replace('\n\n\n','\n\n')
  164.                     L = L.replace('\t\n','\n\n')
  165.                     L = L.replace(' \n','\n\n')
  166.                     L = L.replace('\n.','\n')
  167.                     L = L.replace('\t ','\t')
  168.                     L = L.replace(' \t','\t')
  169.                 f = z.replace('/','-')
  170.                 new_file=os.path.join(file,f+'.txt')
  171.                 write_data(new_file, L)
  172.                 checked.append(z)
  173.                 write_data(logs, z)
  174.                 print(L)
  175.                 if z == links[-1]:
  176.                     log = '{} files ***$$$$$*** pg: {}'.format(len(checked), p)
  177.                     write_data(pagemark, log)
  178.                     print(log)
  179.                     print([file])
  180.                     print()
  181.         else:
  182.             print('.',end='')
  183.  
  184.     print()
  185.     print(p)
  186.     p+=1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement