Advertisement
TShiva

Graphs

Jul 21st, 2016
252
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.09 KB | None | 0 0
  1. import requests
  2. import re
  3. import json
  4.  
  5. check_sites = set()
  6. graph = {}
  7. accepted = []
  8. rejected = []
  9. node=0
  10.  
  11. def is_html_text_page(url):
  12.     if url in accepted:
  13.         return True
  14.     if url in rejected:
  15.         return False
  16.  
  17.     if url.find('.css') != -1 or url.find('.png') != -1 or url.find('.xml') != -1 or url.find('.jpg') != -1:
  18.         rejected.append(url)
  19.         return False
  20.  
  21.     r = requests.head(url)
  22.  
  23.     if r.status_code != 200:
  24.          rejected.append(url)
  25.          return False
  26.  
  27.     is_ok = r.headers["content-type"].startswith("text/html")
  28.     if is_ok==True:
  29.         accepted.append(url)
  30.         return True
  31.     else:
  32.         rejected.append(url)
  33.         return False
  34.  
  35.  
  36.  
  37. def site_parser(sites):
  38.     norm_sites = set()
  39.     for line in sites:
  40.         if line.find('www.gehtsoftusa.com') != -1:
  41.             if is_html_text_page(line):
  42.                 if line[-1] == '/':
  43.                     line = line[:-1]
  44.                     norm_sites.add(line)
  45.                 print(line + " is good html doc, process...")
  46.             else:
  47.                 print(line + " is bad html doc, skip...")
  48.     return norm_sites
  49.  
  50. def get_sites(r):
  51.     sites = []
  52.     response = requests.get(r)
  53.     sites = re.findall(r'href=[\'"]?([^\'" >]+)', response.text)#[^#]
  54.     normal_sites = list(site_parser(sites))
  55.     return normal_sites
  56.  
  57. def graph_creater(url):
  58.     global graph
  59.     global node
  60.     print(node)
  61.     node += 1
  62.     if url not in graph:
  63.         cur_sites = get_sites(url)
  64.         graph[url] = cur_sites
  65.         for site in cur_sites:
  66.             graph_creater(site)
  67.  
  68. def json_create(dictionary):
  69.     global graph
  70.     jsonfile=open('graf.json','w')
  71.     json.dump(dictionary,jsonfile)
  72.     jsonfile.close()
  73.  
  74. def from_jsom_to_graph():
  75.     jsonfile = open('graf.json', 'r')
  76.     new_graph=json.load(jsonfile)
  77.     jsonfile.close()
  78.     return new_graph
  79.  
  80. def d_graph():
  81.     graph = from_jsom_to_graph()
  82.     new_graph={}
  83.     keys=list(graph.keys())
  84.     for key in keys:
  85.         value = graph[key]
  86.         new_graph[key] = []
  87.         for v in value:
  88.             new_graph[key].append([v, 10000])
  89.     return new_graph
  90.  
  91.  
  92.  
  93. def dejkstra(url,url2):
  94.     if url1 == url2:
  95.         return 0
  96.     graph=d_graph()
  97.     first_lvl=set()
  98.     second_lvl = set()
  99.     mark = set()
  100.     distance=0
  101.     keys = list(graph.keys())
  102.     first_lvl.add(url1)
  103.     path={}
  104.     while len(mark) != len(graph):
  105.         distance+=1
  106.         for key in (set(keys)&first_lvl):
  107.             value = graph[key]
  108.             if (key in first_lvl) and key not in mark:
  109.                 for v in value:
  110.                     if distance<v[1]:
  111.                         v[1]=distance
  112.                         if v[0] not in first_lvl:
  113.                             second_lvl.add(v[0])
  114.                         path[key] = v[0]
  115.                 mark.add(key)
  116.  
  117.         if url2 in first_lvl:
  118.             node = url2
  119.             for i in range(1, distance):
  120.                 print(node)
  121.                 node = path[node]
  122.             print(node)
  123.             print(url)
  124.             return distance
  125.         first_lvl.clear()
  126.  
  127.         distance+=1
  128.         for key in (set(keys)&second_lvl):
  129.             value = graph[key]
  130.             if (key in second_lvl) and (key not in mark):
  131.                 for v in value:
  132.                     if distance < v[1]:
  133.                         v[1] = distance
  134.                         if v[0] not in second_lvl:
  135.                             first_lvl.add(v[0])
  136.                         path[key] = v[0]
  137.                 mark.add(key)
  138.  
  139.         if url2 in second_lvl:
  140.             node=url2
  141.             for i in range(1,distance):
  142.                 print(node)
  143.                 node=path[node]
  144.             print(node)
  145.             print(url)
  146.             return distance
  147.         second_lvl.clear()
  148.  
  149. #r = 'http://www.gehtsoftusa.com'
  150. #graph_creater(r)
  151. #json_create(graph)
  152. graph=from_jsom_to_graph()
  153. url1='http://www.gehtsoftusa.com/premier-resource-software-development'
  154. url2='http://www.gehtsoftusa.com/gehtsoft-usa-llc-attends-midmarket-cio-forum-april-6-8-2014'
  155. print(dejkstra(url1,url2))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement