Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # gzip_find_nearest.py
- import gzip
def predict(test_set, training_set, expected=None):
    """Guess the nearest training string for every test string and report it.

    Distance is the Normalized Compression Distance (NCD) measured with gzip:

        NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))

    where ``C(s)`` is the gzip-compressed size of ``s`` in bytes and ``xy`` is
    ``x + " " + y``. For each test string the guess, its match percentage and
    whether it was right are printed, followed by a summary line.

    Args:
        test_set: Sized collection of strings to match.
        training_set: Candidate strings to match against.
        expected: Optional mapping from each test string to its correct
            training string. When omitted, the module-level ``answers``
            mapping is used.

    Raises:
        ValueError: If ``test_set`` is non-empty but ``training_set`` is empty.
        KeyError: If a test string has no entry in the expected mapping.
    """
    if expected is None:
        expected = answers  # fall back to the mapping built by the test script

    def compressed_size(text):
        return len(gzip.compress(text.encode()))

    # C(x2) does not depend on the test string, so compress each candidate
    # once instead of once per (test, candidate) pair.
    candidates = [(x2, compressed_size(x2)) for x2 in training_set]
    if not candidates and len(test_set) > 0:
        raise ValueError("training_set must not be empty")

    predicted = 0
    for x1 in test_set:
        # NCD is defined on compressed sizes for both operands; the raw
        # length of x1 is not comparable with the compressed size of x2.
        cx1 = compressed_size(x1)
        # min() over (distance, candidate) tuples picks the smallest distance
        # and breaks ties on the candidate string, like sorting would.
        ncd, guess = min(
            (
                (compressed_size(" ".join([x1, x2])) - min(cx1, cx2))
                / max(cx1, cx2),
                x2,
            )
            for x2, cx2 in candidates
        )
        print(f"\nAltered Test: {x1}")
        # ncd is a fraction (0 = identical), so scale it to a percentage.
        match_percentage = round((1 - ncd) * 100, 6)
        print(f"\tGuessing Nearest Match: {guess} at {match_percentage}%")
        if expected[x1] == guess:
            print("\t+++ CORRECT +++")
            predicted += 1
        else:
            print(f"\t\tExpected Answer: {expected[x1]}")
    print(f"\n\nPredicted: {predicted} out of {len(test_set)}")
- # for testing
- import random
- import string
# Shared state for the self-test script.
# Pool of the 52 ASCII letters; generate_tests() reorders it in place.
letters = [*string.ascii_letters]
# Pool size. The pool is only ever rotated (pop + append), so it stays valid.
L = len(letters)
# Generated test strings, appended to by generate_tests().
tests = list()
def generate_tests(num_tests):
    """Append ``num_tests`` nine-letter strings to the module-level ``tests``.

    Letters are drawn from the module-level ``letters`` pool, which is
    reordered in place: for the n-th letter drawn overall (counting from 0),
    the entry at index ``n % 11`` is moved to the end of the pool and used as
    the next character. After each finished string one more entry, at index
    ``n % 7``, is moved to the end without being used.

    Args:
        num_tests: Number of strings to generate.
    """
    letters_per_test = 9
    for test_no in range(num_tests):
        first = test_no * letters_per_test
        picked = []
        for drawn in range(first, first + letters_per_test):
            letter = letters.pop(drawn % 11)
            letters.append(letter)
            picked.append(letter)
        tests.append("".join(picked))
        # Extra rotation between strings; the moved letter is not used.
        letters.append(letters.pop((first + letters_per_test) % 7))
def alter_test(test, num_changes=2, alphabet=None):
    """Return a copy of ``test`` with ``num_changes`` characters replaced.

    Distinct positions are picked at random. Each picked character is replaced
    by the alphabet entry found 1 to ``len(alphabet) - 1`` places further on
    (wrapping around), so with an alphabet of distinct entries the replacement
    always differs from the original character.

    Args:
        test: String to alter; every character that gets picked must occur
            in the alphabet.
        num_changes: How many positions to change (default 2).
        alphabet: Sequence of allowed characters. When omitted, the
            module-level ``letters`` pool is used.

    Returns:
        The altered string, the same length as ``test``.

    Raises:
        ValueError: If ``test`` is shorter than ``num_changes``, or a picked
            character is not in the alphabet.
    """
    pool = letters if alphabet is None else alphabet
    size = len(pool)
    chars = list(test)
    # Sample positions from the actual string length rather than a fixed 9,
    # so strings of any length can be altered.
    for pos in random.sample(range(len(chars)), num_changes):
        shift = random.randint(1, size - 1)
        chars[pos] = pool[(pool.index(chars[pos]) + shift) % size]
    return "".join(chars)
# Build 1000 nine-letter test strings.
generate_tests(1000)
# Derive one altered copy (two letters changed) from every test string.
altered_tests = list(map(alter_test, tests))
# Remember which original each altered string came from.
answers = {altered: original for altered, original in zip(altered_tests, tests)}
# Shuffle the originals so list position gives nothing away.
random.shuffle(tests)
# Predict the nearest original for each altered string and print the report.
predict(altered_tests, tests)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement