Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- if len(sys.argv) != 3: # asks for a valid input
- print(f"Usage: python dna.py data.csv sequence.txt")
- exit(1)
- ''' unlike C, you don't have to declare everything at the top. I personally prefer to declare
- things close to where they are implemented, but I'm honestly not sure what the PEP8 standard is.
- I've adjusted things to fit my style, FWIW. '''
- # defines all the variables
- ''' document_file = sys.argv[1]
- dna_file = sys.argv[2]
- text = open(dna_file)
- people = open(document_file) '''
- ''' "open(file)" is almost always a sniff, instead use a context manager
- https://www.youtube.com/watch?v=C-gEQdGVXbk&feature=emb_logo&t=4m25s '''
- with open(sys.argv[2], newline='') as f:
- text = f.read()
- with open(sys.argv[1], newline='') as f:
- people = f.read()
- ''' these are unused
- temp_list = []
- list_of_people = [] '''
- ''' dna_list = []
- for line in people: # adds all of the sentences into a list
- dna_list.append(line) '''
- ''' This is a list comprehension, one of the handier tools in python
- https://www.pythonforbeginners.com/basics/list-comprehensions-in-python '''
- dna_list = [line for line in people]
- ''' letters = []
- for words in text: # makes a list of all the dna letters in the letters file
- for letter in words:
- letters.append(letter) '''
- ''' list comprehensions are normally more readable, but I'm not sure that is true when nesting like this.
- I decided to leave it more for exposure than suggesting it's best practices.
- I found the syntax here:
- https://www.geeksforgeeks.org/nested-list-comprehensions-in-python/ '''
- letters = [letter for word in text for letter in word]
- ''' for i in range(len(dna_list)): # makes the dna_list into a string
- peoples_dna += dna_list[i]
- peoples_dna = peoples_dna.strip() '''
- ''' peoples_dna = ''
- for _, val in enumerate(dna_list): # makes the dna_list into a string
- peoples_dna += val
- peoples_dna = peoples_dna.strip() '''
- ''' .join method - https://stackoverflow.com/questions/493819/why-is-it-string-joinlist-instead-of-list-joinstring '''
- peoples_dna = ''.join([val for _, val in enumerate(dna_list)]).strip()
- ''' Don't overwrite reserved keywords like 'list', 'dict', 'str', it can have bad consequences
- list = [] '''
- '''lst = []
- for sentence in peoples_dna.splitlines(): # makes a double list where there is few list in a list
- lst.append(sentence.split(',')) '''
- lst = [sentence.split(',') for sentence in peoples_dna.splitlines()]
- def get_repeat_list(letters=[], lst=[]):
- dna_repeated_list = []
- for m in range(len(lst[0]) - 1):
- ''' by putting total_appear_time_list here you automatically reset it on each loop, meaning you don't
- have to do it manually at the end. The same applies for total_times '''
- total_appear_time_list = []
- m = m + 1
- # checks the number of times something appeared in the list
- for i in range(len(letters) - 3):
- total_appear_times = 0
- word = "" # makes it go be nothing
- try: # if it gets to the end of the dna line it will make a list of range not found
- ''' for a in range(len(lst[0][m])):
- word += letters[a + i] '''
- for idx, _ in enumerate(lst[0][m]):
- word += letters[idx + i]
- except:
- break
- while word == lst[0][m]: # does it until the word is not the dna part
- total_appear_times += 1
- word = ''
- try:
- i = i + len(lst[0][m])
- for a in range(len(lst[0][m])):
- word += letters[a + i]
- except: # remember to check if there is a error 'list index out of range'
- break # exits the loop becuase it is at the end of the dna line
- # makes the number of repetion into a list
- total_appear_time_list.append(total_appear_times)
- ''' this sort is costing you a lot, and is unnecessary because of the max() method for lists, see below
- # sorts it from largest to lowest
- sorted_list = sorted(total_appear_time_list, reverse=True) '''
- # makes the count back to zero for the amount of words in the dna file
- '''total_appear_times = 0 '''
- # makes another list to store the biggest number of times the dna string has appeared
- ''' dna_repeated_list.append(sorted_list[0]) '''
- dna_repeated_list.append(max(total_appear_time_list))
- # it sets the list of all the times the dna pattern has been found
- ''' total_appear_time_list = [] '''
- return dna_repeated_list
- dna_repeated_list = get_repeat_list(letters=letters, lst=lst)
- def get_match(lst=[], dna_repeated_list=[]):
- # loop through the dna file with the people on it checking all the verticle rows except the first
- for m in range(len(lst) - 1):
- adding = 0
- m = m + 1
- # loop through the dna file with the people on it to checking the numbers but doesn't check the names
- for q in range(len(lst[1]) - 1):
- q = q + 1 # makes the variable q start at 1
- lst[m][q] = int(lst[m][q]) # makes it an int to compare it
- # compares the double list to the singular list with all the maximumun number of times the pattern appears in a row
- if lst[m][q] != dna_repeated_list[q - 1]:
- adding += 1 # if it does not match then add it by one
- if adding == 0: # if it all matches then print the name of the person
- print(f"{lst[m][0]}")
- exit(1)
- # adding = 0 # makes the variable adding go back to zero
- print(f"No match") # if it has not been found
- get_match(lst=lst, dna_repeated_list=dna_repeated_list)
Add Comment
Please, Sign In to add comment