Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
NLP-Web-App/model.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
65 lines (57 sloc)
2.67 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from http.client import OK | |
import pandas as pd | |
import numpy as np | |
from sklearn.linear_model import LinearRegression | |
import ast | |
from sklearn.metrics.pairwise import cosine_similarity as cs | |
from sklearn.manifold import TSNE | |
from sklearn.decomposition import PCA | |
# Load precomputed embeddings for every paragraph. Each row holds (in order):
# prompt, completion, prompt-no-labels embeddings, prompt embeddings, completion embeddings.
df = pd.read_csv('dyslexia_embeddings_ada.csv') # This csv stores (in order): prompt, completion, prompt-no-labels embeddings, prompt embeddings, completion embeddings
# Fitted later on (reduced completion embeddings -> user ratings); its learned
# coefficients act as a "user preference vector" for ranking paragraphs.
model = LinearRegression()
# Following two methods are different ways of reducing the dimensionality of the large embeddings
def reduce_dim_tsne(content_vecs):
    """Project the embedding vectors down to 3 dimensions with t-SNE.

    The random_state is pinned so repeated runs give the same layout.
    Note: t-SNE refits from scratch on every call and cannot transform
    new, unseen points.
    """
    reducer = TSNE(
        n_components=3,
        perplexity=15,
        random_state=42,
        init='random',
        learning_rate=200,
    )
    return reducer.fit_transform(content_vecs)
def reduce_dim_pca(content_vecs):
    """Reduce the embedding vectors to their 10 leading principal components."""
    reduced = PCA(n_components=10).fit_transform(content_vecs)
    return reduced
# ---- Collect the user's ratings for the first n paragraphs ----
# This vector will store the user's ratings of each paragraph.
# Brian's scores of the first 25 are [3,4,4,2,2,1,1,2,3,4,5,3,3,2,5,5,3,4,3,2,3,3,2,2,2]
n = 5
user_scores = [0 for i in range(n)]
# Convert each embedding from its string representation in the csv to a real list of floats.
content_vecs = [ast.literal_eval(r['comp_embeddings']) for _, r in df.iterrows()]
for i in range(n):
    print(df.completion[i])
    print("Rating:", end=" ")
    rate = input()
    user_scores[i] = int(rate)
    print()
print(f"These are the user scores for the first {n} paragraphs: {user_scores}")

# ---- Fit a linear model: reduced embedding -> user rating ----
content_vecs = reduce_dim_pca(content_vecs)
x = np.array(content_vecs)[:len(user_scores)]
y = np.array(user_scores).reshape(-1, 1)  # column vector, as sklearn expects
model.fit(x, y)
trained_uservec = model.coef_  # This is the user vector derived from the linear regression

# ---- Rank every paragraph by cosine similarity to the learned user vector ----
vv = np.array(trained_uservec)
cc = np.array(content_vecs)
arr = cs(cc, vv)  # cosine_similarity(i'th content vector, user vector) for every paragraph
arr = [a[0] for a in arr]
inds = np.argpartition(arr, -n)[-n:]  # indices of the n highest-similarity paragraphs
inds = list(inds)
res = pd.DataFrame(df.iloc[inds]['completion'])
final_user_scores = [0 for i in range(n)]
# reshape(1, -1) stays correct even if the PCA dimensionality above changes.
final_predicted_scores = [
    float(model.predict(np.array(content_vecs[inds[i]]).reshape(1, -1))[0][0])
    for i in range(n)
]

# ---- Collect the user's ratings for the recommended paragraphs ----
print(f"We've found {n} paragraphs that we would like you to rate.")
print("")
for i in range(n):
    print(res['completion'][inds[i]])
    print("Rating:", end=" ")
    score = input()
    final_user_scores[i] = int(score)
    print()
print(f"These are the final user scores {final_user_scores}")
print(f"These are the predicted final scores {final_predicted_scores}")