Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Find the two lines of a text file most similar to its first line.

Each line of the input file is treated as one sentence.  Every sentence is
turned into a bag-of-words count vector (words are maximal runs of the
letters a-z after lower-casing), and the remaining sentences are ranked by
cosine distance to the first one.  The zero-based line indexes of the two
closest sentences are written, space separated, to the result file.
"""
import re

import numpy as np
from scipy import spatial as sp

INPUT_PATH = "2.3.1.txt"
OUTPUT_PATH = "2.3.1.result.txt"

# Anything that is not a lowercase ASCII letter separates two words.
_SEPARATOR = re.compile(r"[^a-z]")


def tokenize(line):
    """Return the lowercase a-z words of *line*, in order of appearance."""
    return [token for token in _SEPARATOR.split(line.lower()) if token]


def build_frequency_matrix(lines):
    """Build the sentence-by-word count matrix for an iterable of lines.

    Args:
        lines: iterable of strings; each item is one sentence.  Blank lines
            are kept as all-zero rows so that row indexes stay equal to
            line indexes.

    Returns:
        A ``(vocabulary, matrix)`` pair.  ``vocabulary`` maps each distinct
        word to its column index (columns follow first-occurrence order);
        ``matrix[i, j]`` is how many times word ``j`` occurs in line ``i``.
    """
    vocabulary = {}
    tokenized = []
    for line in lines:
        words = tokenize(line)
        tokenized.append(words)
        for word in words:
            # A new word gets the next free column index.
            vocabulary.setdefault(word, len(vocabulary))

    # Filling the matrix in a second pass writes every count directly to
    # its (line, word) cell, so no per-word list padding is needed.
    matrix = np.zeros((len(tokenized), len(vocabulary)))
    for row, words in enumerate(tokenized):
        for word in words:
            matrix[row, vocabulary[word]] += 1
    return vocabulary, matrix


def closest_to_first(matrix, count=2):
    """Return indexes of the rows nearest to row 0 by cosine distance.

    Args:
        matrix: 2-D array of per-sentence word counts.
        count: maximum number of row indexes to return.

    Returns:
        Up to *count* row indexes (row 0 excluded), nearest first.  Ties
        are broken in favour of the lower index.  Rows that are all zeros
        are skipped because cosine distance is undefined for them; if row 0
        itself is all zeros, or the matrix is empty, the result is empty.
    """
    if len(matrix) == 0 or not matrix[0].any():
        return []
    reference = matrix[0]
    ranked = sorted(
        (sp.distance.cosine(reference, matrix[index]), index)
        for index in range(1, len(matrix))
        if matrix[index].any()
    )
    return [index for _, index in ranked[:count]]


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Read *input_path*, print the vocabulary and matrix, write the result.

    The result file receives two space-separated line indexes.  When fewer
    than two sentences can be ranked, the missing positions are written
    as ``0``.
    """
    with open(input_path, "r") as source:
        vocabulary, matrix = build_frequency_matrix(source)

    print(vocabulary.keys())
    for row in matrix:
        print(row)

    nearest = closest_to_first(matrix, count=2)
    nearest += [0] * (2 - len(nearest))

    with open(output_path, "w") as sink:
        sink.write(" ".join(str(index) for index in nearest))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement