Advertisement
RupeshAcharya60

text extraction

May 1st, 2023
691
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.04 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3.  
# Load the property title dataset.
# Assumes a CSV with at least 'description' and 'title' columns — TODO confirm.
titles = pd.read_csv('property_titles.csv')

# Preprocess the data: bag-of-words features from the free-text description.
# NOTE(review): CountVectorizer is used here but never imported anywhere
# visible — the file needs
#   from sklearn.feature_extraction.text import CountVectorizer
# at the top or this line raises NameError.
corpus = titles['description']
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus).toarray()  # dense (n_samples, n_vocab) matrix
y = titles['title']

# Split the dataset into training and testing sets (seeded 80/20 shuffle,
# so the split is reproducible across runs).
np.random.seed(42)
indices = np.random.permutation(len(X))
split = int(0.8 * len(X))
train_indices, test_indices = indices[:split], indices[split:]
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
  20.  
# Network dimensions: bag-of-words input -> one hidden layer -> one output
# unit per distinct title class.
input_size = X_train.shape[1]
hidden_size = 100
output_size = len(y_train.unique())

# Initialize both weight matrices with a fixed seed for reproducibility.
# NOTE(review): unscaled randn can saturate/explode when input_size is large;
# a scaled scheme (e.g. He init, randn * sqrt(2 / fan_in)) is typical for
# ReLU networks — confirm whether plain randn is intentional.
np.random.seed(42)
W1 = np.random.randn(input_size, hidden_size)
W2 = np.random.randn(hidden_size, output_size)
  29.  
  30. def relu(x):
  31.     return np.maximum(x, 0)
  32.  
  33. def softmax(x):
  34.     exps = np.exp(x - np.max(x, axis=1, keepdims=True))
  35.     return exps / np.sum(exps, axis=1, keepdims=True)
  36.  
  37. def forward(X, W1, W2):
  38.     Z1 = X.dot(W1)
  39.     A1 = relu(Z1)
  40.     Z2 = A1.dot(W2)
  41.     A2 = softmax(Z2)
  42.     return Z1, A1, Z2, A2
  43.  
  44. def backward(X, y, Z1, A1, Z2, A2, W2, lr=0.1):
  45.     Y = pd.get_dummies(y).values
  46.     dZ2 = A2 - Y
  47.     dW2 = A1.T.dot(dZ2)
  48.     dA1 = dZ2.dot(W2.T)
  49.     dZ1 = dA1 * (Z1 > 0)
  50.     dW1 = X.T.dot(dZ1)
  51.     W1 -= lr * dW1
  52.     W2 -= lr * dW2
  53.  
  54. epochs = 1000
  55. for epoch in range(epochs):
  56.     Z1, A1, Z2, A2 = forward(X_train, W1, W2)
  57.     backward(X_train, y_train, Z1, A1, Z2, A2, W2)
  58.     if epoch % 100 == 0:
  59.         loss = np.sum(-np.log(A2[np.arange(len(X_train)), y_train])) / len(X_train)
  60.         print(f'Epoch {epoch}, loss = {loss}')
  61.  
  62. # Generate a property title using the trained model
  63. def create_title(area, location, property_type):
  64.     description = f"{property_type} in {location} - {area} sqft"
  65.     X_new = vectorizer.transform([description]).toarray()
  66.     _, _, _, A2 = forward(X_new, W1, W2)
  67.     title_idx = np.argmax(A2)
  68.     title = y.unique()[title_idx]
  69.     return title
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement