Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- import re
- import json
# Shared crawler state, mutated by the functions below.
check_sites = set()            # reserved set of sites (not referenced elsewhere in this file)
graph = dict()                 # page URL -> list of same-site page URLs it links to
accepted, rejected = [], []    # URLs already classified as HTML / not HTML
node = 0                       # running count of graph_creater visits (progress output)
def is_html_text_page(url):
    """Return True when *url* answers a HEAD request with 200 and an HTML content type.

    Verdicts are remembered in the module-level ``accepted`` / ``rejected``
    lists, so each URL triggers at most one network request per run.

    A URL is rejected when:
      * it contains one of the substrings ``.css``, ``.png``, ``.xml``, ``.jpg``
        (checked without touching the network);
      * the HEAD request fails (connection error, timeout, malformed URL);
      * the status code is not 200;
      * the ``Content-Type`` header is missing or does not start with
        ``text/html``.
    """
    if url in accepted:
        return True
    if url in rejected:
        return False

    # Cheap pre-filter: obvious static assets never need a request.
    if any(marker in url for marker in ('.css', '.png', '.xml', '.jpg')):
        rejected.append(url)
        return False

    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.head(url, timeout=10)
    except requests.RequestException:
        # Unreachable or malformed links are treated like any other bad page
        # instead of aborting the whole crawl.
        rejected.append(url)
        return False

    # .get() because servers are not obliged to send a Content-Type header.
    content_type = response.headers.get("content-type", "")
    if response.status_code == 200 and content_type.startswith("text/html"):
        accepted.append(url)
        return True

    rejected.append(url)
    return False
def site_parser(sites):
    """Keep only the links that are HTML pages on www.gehtsoftusa.com.

    Each link containing the host name is checked with ``is_html_text_page``
    and a one-line verdict is printed for it. Accepted links are returned as
    a set, with a single trailing ``/`` removed so that both spellings of a
    page collapse to one entry. Links without the host name are ignored
    silently.
    """
    kept = set()
    for link in sites:
        if 'www.gehtsoftusa.com' not in link:
            continue
        if not is_html_text_page(link):
            print(link + " is bad html doc, skip...")
            continue
        # Normalise after the check, so the verdict cache sees the raw link.
        page = link[:-1] if link.endswith('/') else link
        kept.add(page)
        print(page + " is good html doc, process...")
    return kept
def get_sites(r):
    """Download page *r* and return the same-site HTML pages it links to.

    Every ``href`` value in the response text is extracted with a regular
    expression (the value ends at the first quote, space or ``>``, so
    ``#fragment`` parts are kept) and passed through ``site_parser``.

    Returns a list of normalised URLs; the list is empty when the page
    cannot be downloaded.
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(r, timeout=10)
    except requests.RequestException:
        # One unreachable page should not abort the whole crawl.
        print(r + " could not be fetched, skip...")
        return []

    hrefs = re.findall(r'href=[\'"]?([^\'" >]+)', response.text)
    return list(site_parser(hrefs))
def graph_creater(url):
    """Crawl depth-first from *url* and fill the module-level ``graph``.

    ``graph[page]`` becomes the list returned by ``get_sites(page)``. Pages
    already present in ``graph`` are not fetched again. The module-level
    ``node`` counter is printed and incremented once per visited link
    (including repeat visits), which serves as progress output.

    An explicit stack is used instead of recursion, so a long chain of pages
    cannot exceed the interpreter's recursion limit. The visit order is the
    same as a recursive depth-first walk.
    """
    global node

    pending = [url]
    while pending:
        current = pending.pop()
        print(node)
        node += 1
        if current in graph:
            continue
        links = get_sites(current)
        graph[current] = links
        # Reversed so the first link is popped (visited) first.
        pending.extend(reversed(links))
def json_create(dictionary):
    """Serialise *dictionary* as JSON into ``graf.json`` in the working directory.

    An existing file is overwritten. The ``with`` block guarantees the file
    is closed even when ``json.dump`` raises (for example on a value that is
    not JSON-serialisable).
    """
    with open('graf.json', 'w', encoding='utf-8') as jsonfile:
        json.dump(dictionary, jsonfile)
def from_jsom_to_graph():
    """Load and return the object stored as JSON in ``graf.json``.

    The file is read from the working directory. The ``with`` block
    guarantees it is closed even when the content is not valid JSON.

    Raises:
        FileNotFoundError: ``graf.json`` does not exist.
        json.JSONDecodeError: the file content is not valid JSON.
    """
    with open('graf.json', 'r', encoding='utf-8') as jsonfile:
        return json.load(jsonfile)
def d_graph():
    """Load the saved link graph and attach a distance slot to every edge.

    Returns a new dict mapping each page to a list of ``[linked_page, 10000]``
    pairs, where 10000 is the initial placeholder distance. The data comes
    from ``from_jsom_to_graph()``.
    """
    adjacency = from_jsom_to_graph()
    return {
        page: [[link, 10000] for link in links]
        for page, links in adjacency.items()
    }
def dejkstra(url, url2):
    """Return the number of link hops on a shortest path from *url* to *url2*.

    The graph is loaded through ``d_graph()``. Every link counts as one hop,
    so a breadth-first search is sufficient to find a shortest path.

    When a path exists it is printed one URL per line, starting at *url2*
    and ending at *url*.

    Returns:
        0 when both URLs are equal (nothing is printed), the hop count when
        *url2* is reachable, and None when it is not.
    """
    if url == url2:
        return 0

    from collections import deque

    graph = d_graph()

    # Maps each discovered page to the page it was first reached from.
    # It doubles as the visited set, so no page is queued twice.
    came_from = {url: None}
    queue = deque([url])

    while queue:
        page = queue.popleft()
        # Pages that were linked to but never crawled have no entry.
        for link, _ in graph.get(page, ()):
            if link in came_from:
                continue
            came_from[link] = page
            if link == url2:
                # Walk the predecessor chain back to the start.
                trail = [url2]
                while trail[-1] != url:
                    trail.append(came_from[trail[-1]])
                for step in trail:
                    print(step)
                return len(trail) - 1
            queue.append(link)

    # Everything reachable was explored without meeting url2.
    return None
# graf.json is produced by a one-off crawl. To rebuild it, run these three
# statements once before the lookup below:
#     r = 'http://www.gehtsoftusa.com'
#     graph_creater(r)
#     json_create(graph)
graph = from_jsom_to_graph()

# Example query: how many links apart are these two pages?
url1, url2 = (
    'http://www.gehtsoftusa.com/premier-resource-software-development',
    'http://www.gehtsoftusa.com/gehtsoft-usa-llc-attends-midmarket-cio-forum-april-6-8-2014',
)
print(dejkstra(url1, url2))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement