# ChatbotCodini.py
"""
ChatbotCodini is intended first and foremost as a Python and JavaScript code
assistant. It is also meant to be an open-source community project in which
changes to the code are selected by online votes from every participant,
ideally making it an easy way for everybody involved to earn money. To make
building the project (including its database) more enjoyable, contributing
could be gamified: each person scores reputation points for contributions,
and each person may award others a maximum of 1000 points per day.
"""

import math
import random
import re

def tokenize(text):
    # Keep letters, digits, and apostrophes; everything else becomes a space.
    text = re.sub(r"[^a-zA-Z0-9']", " ", text)
    tokens = text.lower().split()
    return tokens
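# Example: tokenize("Hello, world! It's me.") -> ['hello', 'world', "it's", 'me']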

def attention_score(query, key):
    # Scaled dot-product attention score: dot(query, key) / sqrt(dimension).
    dot_product = sum(qi * ki for qi, ki in zip(query, key))
    return dot_product / math.sqrt(len(query))
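# Example: attention_score([1.0, 0.0], [1.0, 0.0]) -> 1 / sqrt(2) ~= 0.7071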

def pad_sequences(sequences, maxlen=None, dtype=None, padding='pre', truncating='pre', value=0):
    # Pads (and optionally truncates) sequences to the same length.
    if not isinstance(sequences, list):
        sequences = [sequences]
    # Get the maximum length of the sequences
    lengths = [len(s) for s in sequences]
    maxlen = maxlen or max(lengths)
    # Create a list of lists filled with the value argument
    x = [[value] * maxlen for _ in range(len(sequences))]
    # Fill in the list of lists with the sequences
    for idx, seq in enumerate(sequences):
        if truncating == 'pre':
            seq = seq[-maxlen:]
        elif truncating == 'post':
            seq = seq[:maxlen]
        if padding == 'pre':
            x[idx][maxlen - len(seq):] = seq
        elif padding == 'post':
            x[idx][:len(seq)] = seq
    # Optionally cast every element with a callable dtype such as int or float
    if callable(dtype):
        x = [[dtype(e) for e in row] for row in x]
    return x
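# Example: pad_sequences([[1, 2], [3]]) -> [[1, 2], [0, 3]]
# Example: pad_sequences([[1, 2], [3]], padding='post') -> [[1, 2], [3, 0]]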

def softmax(values):
    # Subtract the maximum before exponentiating so large inputs cannot overflow.
    m = max(values)
    exp_values = [math.exp(v - m) for v in values]
    total = sum(exp_values)
    return [v / total for v in exp_values]
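# Example: softmax([1.0, 2.0]) -> [0.2689..., 0.7310...]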

def layer_norm(x, epsilon=1e-6):
    mean = sum(x) / len(x)
    var = sum((xi - mean) ** 2 for xi in x) / len(x)
    return [(xi - mean) / math.sqrt(var + epsilon) for xi in x]
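# Example: layer_norm([1.0, 3.0]) -> [-0.9999..., 0.9999...] (mean 2, variance 1)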

def dropout(x, dropout_rate=0.5):
    # Inverted dropout: randomly zero each element and scale survivors by
    # 1 / keep_prob so the expected activation is unchanged.
    keep_prob = 1 - dropout_rate
    return [xi / keep_prob if random.uniform(0, 1) < keep_prob else 0.0 for xi in x]
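# Example (random): dropout([1.0, 2.0, 3.0], 0.5) might return [2.0, 0.0, 6.0];
# each kept element is scaled by 1 / keep_prob = 2.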

def relu(x):
    return [max(0, xi) for xi in x]
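# Example: relu([-2.0, 0.5]) -> [0, 0.5]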

def load_conversational_data(data_file):
    with open(data_file, 'r') as f:
        raw_data = f.read()
    return raw_data.split('\n')

def save_conversational_data(data_file, raw_data):
    with open(data_file, 'w') as f:
        f.write(raw_data)

def preprocess_conversational_data(data):
    tokenized_data = [tokenize(sentence) for sentence in data]
    # Pad with empty strings so the padding value matches the token type.
    return pad_sequences(tokenized_data, value='')
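# Example: preprocess_conversational_data(["hi there", "ok"]) -> [['hi', 'there'], ['', 'ok']]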

def indent(code):
    # Normalizes space-based indentation to tabs for consistency, returning the
    # code as a string. Falls back to the original code if no width fits.
    def f(lines, spaces):
        fixed_indents = []
        for line in lines:
            i = 0
            while line.startswith(spaces):
                line = line[len(spaces):]
                i += 1
            if line.startswith(' '):
                return False
            fixed_indents.append(('\t' * i) + line)
        return '\n'.join(fixed_indents)
    lines = code.split('\n')
    # Try indent widths from 4 spaces down to 1.
    for spaces in ('    ', '   ', '  ', ' '):
        if spaces in code:
            fixed_indents = f(lines, spaces)
            if fixed_indents:
                return fixed_indents
    return code
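# Example: indent("def f():\n    return 1") -> "def f():\n\treturn 1"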

def matvec(matrix, vector):
    # Multiply a (rows x len(vector)) matrix, stored as a list of rows, by a
    # vector. Used by the attention and feedforward layers below.
    return [sum(m * v for m, v in zip(row, vector)) for row in matrix]

class ChatbotTransformer:
    def __init__(self, embeddings, num_heads, hidden_size, dropout_prob):
        self.embeddings = embeddings
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.dropout_prob = dropout_prob
        self.params = self.initialize_params()

    def initialize_params(self):
        params = {}
        # Embedding layer
        params["embedding_weights"] = self.embeddings
        # Multi-head attention projections: w_q, w_k, and w_v each map a
        # hidden_size vector to num_heads stacked hidden_size vectors; w_o
        # projects the concatenated heads back down to hidden_size.
        params["w_q"] = [[random.uniform(-0.1, 0.1) for _ in range(self.hidden_size)] for _ in range(self.hidden_size * self.num_heads)]
        params["w_k"] = [[random.uniform(-0.1, 0.1) for _ in range(self.hidden_size)] for _ in range(self.hidden_size * self.num_heads)]
        params["w_v"] = [[random.uniform(-0.1, 0.1) for _ in range(self.hidden_size)] for _ in range(self.hidden_size * self.num_heads)]
        params["w_o"] = [[random.uniform(-0.1, 0.1) for _ in range(self.hidden_size * self.num_heads)] for _ in range(self.hidden_size)]
        # Feedforward layer
        params["w_ff_1"] = [[random.uniform(-0.1, 0.1) for _ in range(self.hidden_size)] for _ in range(self.hidden_size)]
        params["w_ff_2"] = [[random.uniform(-0.1, 0.1) for _ in range(self.hidden_size)] for _ in range(self.hidden_size)]
        params["b_ff_1"] = [random.uniform(-0.1, 0.1) for _ in range(self.hidden_size)]
        params["b_ff_2"] = [random.uniform(-0.1, 0.1) for _ in range(self.hidden_size)]
        return params

    def encode(self, inputs):
        # One encoder block: embedding lookup, multi-head self-attention with
        # a residual connection, layer norm, and dropout, then a position-wise
        # feedforward layer with a second residual connection, norm, and dropout.
        d = self.hidden_size
        # Apply the embedding layer to the inputs (integer token ids)
        embeddings = [self.params["embedding_weights"][token] for token in inputs]
        # Project every position to stacked per-head queries, keys, and values;
        # slicing by head recovers one hidden_size-long vector per head.
        query = [matvec(self.params["w_q"], e) for e in embeddings]
        key = [matvec(self.params["w_k"], e) for e in embeddings]
        value = [matvec(self.params["w_v"], e) for e in embeddings]
        # Multi-head scaled dot-product self-attention
        concat = [[] for _ in inputs]
        for i in range(self.num_heads):
            lo, hi = i * d, (i + 1) * d
            q = [q_j[lo:hi] for q_j in query]
            k = [k_j[lo:hi] for k_j in key]
            v = [v_j[lo:hi] for v_j in value]
            for j in range(len(inputs)):
                weights = softmax([attention_score(q[j], k_m) for k_m in k])
                head_j = [sum(w * v_m[l] for w, v_m in zip(weights, v)) for l in range(d)]
                concat[j].extend(head_j)
        # Project the concatenated heads back down to hidden_size
        multi_head_output = [matvec(self.params["w_o"], c) for c in concat]
        # Apply layer normalization and dropout to the multi-head attention
        # output, with a residual connection from the embeddings
        attention_output = []
        for e, m in zip(embeddings, multi_head_output):
            residual = [e_l + m_l for e_l, m_l in zip(e, m)]
            attention_output.append(dropout(layer_norm(residual), self.dropout_prob))
        # Apply feedforward layer
        ff_output = []
        for a in attention_output:
            hidden = relu([h + b for h, b in zip(matvec(self.params["w_ff_1"], a), self.params["b_ff_1"])])
            ff_output.append([o + b for o, b in zip(matvec(self.params["w_ff_2"], hidden), self.params["b_ff_2"])])
        # Apply layer normalization and dropout to the feedforward output,
        # with a second residual connection
        output = []
        for a, f in zip(attention_output, ff_output):
            residual = [a_l + f_l for a_l, f_l in zip(a, f)]
            output.append(dropout(layer_norm(residual), self.dropout_prob))
        return output
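
    # Example (hypothetical toy shapes): with a 10-token vocabulary of
    # length-8 embeddings, ChatbotTransformer(embeddings, 2, 8, 0.1).encode([2, 5, 7])
    # returns a list of three length-8 vectors, one per input position.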

    def generate_response(self, inputs, max_length=50, end_of_sequence_token=1):
        # Encode the input sequence
        encoded_inputs = self.encode(inputs)
        d = self.hidden_size
        # Decode the output sequence one token at a time
        output_sequence = []
        prev_output_token = 0  # Start with the <START> token (id 0)
        for _ in range(max_length):
            # Apply the embedding layer to the previous output token
            embedding = self.params["embedding_weights"][prev_output_token]
            # Project the decoder query and the encoder keys and values
            query = matvec(self.params["w_q"], embedding)
            keys = [matvec(self.params["w_k"], e) for e in encoded_inputs]
            values = [matvec(self.params["w_v"], e) for e in encoded_inputs]
            # Multi-head attention over the encoder output
            concat = []
            for i in range(self.num_heads):
                lo, hi = i * d, (i + 1) * d
                q = query[lo:hi]
                k = [k_j[lo:hi] for k_j in keys]
                v = [v_j[lo:hi] for v_j in values]
                weights = softmax([attention_score(q, k_j) for k_j in k])
                for l in range(d):
                    concat.append(sum(w * v_j[l] for w, v_j in zip(weights, v)))
            multi_head_output = matvec(self.params["w_o"], concat)
            # Apply layer normalization and dropout to the multi-head attention
            # output, with a residual connection from the embedding
            residual = [e + m for e, m in zip(embedding, multi_head_output)]
            output = dropout(layer_norm(residual), self.dropout_prob)
            # Apply feedforward layer
            hidden = relu([h + b for h, b in zip(matvec(self.params["w_ff_1"], output), self.params["b_ff_1"])])
            ff_output = [o + b for o, b in zip(matvec(self.params["w_ff_2"], hidden), self.params["b_ff_2"])]
            # Compute the logits for the next output token by dotting the
            # feedforward output against every embedding, then sample from them
            logits = [sum(e_l * f_l for e_l, f_l in zip(row, ff_output)) for row in self.params["embedding_weights"]]
            probabilities = softmax(logits)
            next_output_token = random.choices(range(len(probabilities)), weights=probabilities)[0]
            # Stop decoding when the <END> token is generated
            if next_output_token == end_of_sequence_token:
                break
            # Append the latest output token to the output sequence
            output_sequence.append(next_output_token)
            prev_output_token = next_output_token
        # Convert the output sequence from token ids to text; a vocabulary
        # object with get_token_from_index is assumed to be set on the model.
        output_text = [self.vocab.get_token_from_index(token_id) for token_id in output_sequence]
        return " ".join(output_text)

    def train(self, data_file, start_token=0, end_token=1):
        # Load the conversational data
        data = load_conversational_data(data_file)
        # Preprocess the conversational data. Note: encode expects integer
        # token ids, so a token-to-id vocabulary lookup is assumed to happen
        # as part of preprocessing.
        tokenized_data = preprocess_conversational_data(data)
        # Train the model. This loop assumes a decode(input, hidden) step, a
        # cross_entropy_loss function, and an optimizer with zero_grad/step and
        # loss.backward semantics are provided elsewhere; none are defined in
        # this file yet.
        for sentence in tokenized_data:
            encoder_output = self.encode(sentence)
            # Prepare target input for decoder
            target = [start_token] + sentence + [end_token]
            # Initialize decoder hidden state
            decoder_hidden = encoder_output
            # Initialize decoder input with the start token and the total loss with zero
            decoder_input = start_token
            total_loss = 0
            # Initialize decoder output list
            decoder_output_list = []
            # Loop through decoder steps
            for i in range(len(target) - 1):
                # Get decoder output
                decoder_output, decoder_hidden = self.decode(decoder_input, decoder_hidden)
                # Calculate loss against the next target token
                loss = cross_entropy_loss(decoder_output, target[i + 1])
                # Add loss to total loss
                total_loss += loss
                # Update decoder input (teacher forcing)
                decoder_input = target[i + 1]
                # Add decoder output to list
                decoder_output_list.append(decoder_output)
            # Update parameters using backpropagation
            self.optimizer.zero_grad()
            total_loss.backward()
            self.optimizer.step()
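
# A minimal smoke test of the encoder under assumed toy shapes (a 10-token
# vocabulary of length-8 embeddings, 2 heads, hidden size 8). Everything here
# is illustrative: real vocabulary handling, decoding, and training data are
# not part of this file.
if __name__ == "__main__":
    vocab_size, hidden_size = 10, 8
    embeddings = [[random.uniform(-0.1, 0.1) for _ in range(hidden_size)] for _ in range(vocab_size)]
    model = ChatbotTransformer(embeddings, num_heads=2, hidden_size=hidden_size, dropout_prob=0.1)
    encoded = model.encode([2, 5, 7])
    print(len(encoded), len(encoded[0]))  # -> 3 8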