Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # find_nearest.py
- import math
- def levenshtein_distance(s1, s2):
- if len(s1) < len(s2):
- return levenshtein_distance(s2, s1)
- # len(s1) >= len(s2)
- if len(s2) == 0:
- return len(s1)
- # create two lists a and b for current and previous row
- a = list(range(len(s2) + 1))
- b = [0] * (len(s2) + 1)
- for i, c1 in enumerate(s1):
- b[0] = i + 1
- for j, c2 in enumerate(s2):
- cost = 0 if c1 == c2 else 1
- b[j + 1] = min(b[j] + 1, a[j + 1] + 1, a[j] + cost)
- a, b = b, a
- return a[-1]
- def predict(test_set, training_set):
- predicted = len(test_set)
- for x1 in test_set:
- distance_from_x1 = []
- for x2 in training_set:
- distance = levenshtein_distance(x1, x2)
- distance_from_x1.append((distance, x2))
- sorted_distances = sorted(distance_from_x1)
- guess = sorted_distances[0][1]
- print(f"\nAltered Test: {x1}")
- match_percentage = round((1 - sorted_distances[0][0] / max(len(x1), len(guess))) * 100, 6)
- print(f"\tGuessing Nearest Match: {guess} at {match_percentage}%")
- if answers[x1] == guess:
- print("\t+++ CORRECT +++")
- else:
- print(f"\t\tExpected Answer: {answers[x1]}")
- predicted = predicted - 1
- print(f"\n\nPredicted: {predicted} out of {len(test_set)}")
- # for testing
- import random
- import string
- letters = list(string.ascii_letters)
- L = len(letters)
- tests = []
- def generate_tests(num_tests):
- i = 0
- for _ in range(num_tests):
- test = ''
- for _ in range(25):
- s = letters.pop(i%11)
- letters.append(s)
- i += 1
- test += s
- tests.append(test)
- s = letters.pop(i%7)
- letters.append(s)
- def alter_test(test):
- pos = random.sample(range(25), 12) # choose two positions to change
- new_test = list(test)
- new_test[pos[0]] = letters[(letters.index(new_test[pos[0]]) + random.randint(1, L - 1)) % L]
- new_test[pos[1]] = letters[(letters.index(new_test[pos[1]]) + random.randint(1, L - 1)) % L]
- return ''.join(new_test)
- # generate a list of n random n-letter tests
- generate_tests(1000)
- # make a copy of each test with two letters changed
- altered_tests = [alter_test(test) for test in tests]
- answers = dict(zip(altered_tests, tests))
- random.shuffle(tests)
- # predict nearest matches for each altered test
- predict(altered_tests, tests)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement