Advertisement
Korotkodul

web crawler

Oct 20th, 2022 (edited)
834
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.45 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3.  
  4. #link = "https://edition.cnn.com/"
  5. goto =    "https://www.povarenok.ru/recipes/kitchen/77/"
  6. #link = "https://www.povarenok.ru/recipes/show/70127/"
  7.  
  8. pages = [''] * 13
  9. pages[0] = goto
  10.  
  11. s = 'https://www.povarenok.ru/recipes/kitchen/77/~2/'
  12. pages[1] = s
  13.  
  14.  
  15.  
  16. for i in range(2, 13):
  17.     s = pages[i - 1]
  18.     s = goto + '~' + str(i + 1) + '/'
  19.     pages[i] = s
  20. print(pages)
  21. """в конце изменить goto:
  22.  
  23. """
  24. #for i in range(13):
  25.  
  26. names = []
  27. recep = []
  28. photo = []
  29. #<div class="ingredients-bl">
  30. debug = False
  31.  
  32. def ingrid(goto):
  33.     #print("goto = ", goto)
  34.     html = requests.get(goto)
  35.     html.encoding = 'windows-1251'
  36.     sp = BeautifulSoup(html.text, 'lxml')
  37.     ing = sp.find('div', "ingredients-bl")
  38.     all = ing.find_all('span')
  39.     #print("all")
  40.     #print(all)
  41.     ing_list = []
  42.  
  43.     for i in range(0, len(all), 2):
  44.         if i + 1 >= len(all):
  45.             break
  46.         one_ing = all[i]
  47.         one_ing = str(one_ing)
  48.         since = one_ing.find('n>') + 2
  49.         till = one_ing.find('</')
  50.         ing_str = one_ing[since: till]
  51.         amount = all[i + 1]
  52.         amount = str(amount)
  53.         since = amount.find('n>') + 2
  54.         till = amount.find('</')
  55.         amount_str = amount[since: till]
  56.         res = ing_str + "  " + amount_str
  57.         ing_list.append(res)
  58.         #print(one_ing, amount)
  59.     #print("ing list")
  60.     #print(ing_list)
  61.     recep.append(ing_list)
  62.     #print(ing)
  63.  
  64. def work(goto):
  65.     print("WORK")
  66.     html = requests.get(goto)
  67.     html.encoding = 'windows-1251'
  68.     sp = BeautifulSoup(html.text, 'lxml')
  69.     all = sp.find_all('div', "m-img desktop-img conima")
  70.     #print(all)
  71.     for rec in all:
  72.         if debug:
  73.             print("NEW REC")
  74.             print(rec)
  75.         #print(rec)
  76.         raw = rec.find('img')
  77.         #отсюда название и  ссылка на картинку
  78.         #print("IMAGE")
  79.         #print(img)
  80.         raw = str(raw)
  81.         since = raw.find('Ре')
  82.         till = raw.find('src')
  83.         name = raw[since: till]
  84.         names.append(name)
  85.  
  86.         since = raw.find('htt')
  87.         till = raw.find('/>') - 1
  88.         photo_link = raw[since: till]
  89.         photo.append(photo_link)
  90.         #КАК получить список ингридиентов???
  91.         link = rec.find('a')
  92.         if debug:
  93.             print("link")
  94.             print(link)
  95.             print("end link")
  96.         link = str(link)
  97.         till = link.find(">")
  98.         since = link.find("htt")
  99.         link = link[since: till - 1]
  100.         ingrid(link)
  101.         if debug:
  102.             print("new link")
  103.             print(link)
  104.         #print()
  105.         #print()
  106.         #break
  107.  
  108.  
  109.  
  110.  
  111. for i in range(13):
  112.     work(pages[i])
  113.  
  114. print("photo", len(photo))
  115. print(photo)
  116. print("recep", len(recep))
  117. print(recep)
  118. print("names", len(names))
  119. print(names)
  120.  
  121. import lxml
  122. from xlwt import *
  123. workbook = Workbook(encoding = 'utf-8')
  124. table = workbook.add_sheet('data')
  125. table.write(0, 0, 'Название')
  126. table.write(0, 1, 'Рецепт')
  127. table.write(0, 2, 'Фото')
  128.  
  129. N = len(recep)
  130. line = 0
  131. for i in range(N):
  132.     line += 1
  133.     table.write(line, 0, names[i])
  134.     table.write(line, 2, photo[i])
  135.     table.write(line, 1, 'ингридиенты')
  136.     for j in range(len(recep[i])):
  137.         line += 1
  138.         table.write(line, 1, recep[i][j])
  139.     line += 1
  140.  
  141. workbook.save('recep4.xls')
  142. print("FILE SAVED")
  143.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement