Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
from http.client import OK
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import ast
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# Load the precomputed embedding table and create the (not-yet-fit) regression model.
df = pd.read_csv('dyslexia_embeddings_ada.csv') # This csv stores (in order): prompt, completion, prompt-no-labels embeddings, prompt embeddings, completion embeddings
model = LinearRegression()
# Following two methods are different ways of reducing the dimensionality of the large embeddings
def reduce_dim_tsne(content_vecs):
    """Reduce embedding vectors to 3 dimensions with t-SNE.

    Args:
        content_vecs: array-like of shape (n_samples, n_features), one
            embedding per row.

    Returns:
        ndarray of shape (n_samples, 3) — the t-SNE projection.
        random_state is fixed so repeated runs give the same layout.
    """
    tsne = TSNE(n_components=3, perplexity=15, random_state=42,
                init='random', learning_rate=200)
    return tsne.fit_transform(content_vecs)
def reduce_dim_pca(content_vecs):
    """Reduce embedding vectors to 10 dimensions with PCA.

    Args:
        content_vecs: array-like of shape (n_samples, n_features), one
            embedding per row.

    Returns:
        ndarray of shape (n_samples, 10) — the PCA projection.

    NOTE: the 10-component output size is assumed elsewhere in this script
    (the prediction step reshapes to (1, 10)), so keep them in sync.
    """
    pca = PCA(n_components=10)
    return pca.fit_transform(content_vecs)
# This vector will store the user's ratings of each paragraph
# Brian's scores of the first 25 are [3,4,4,2,2,1,1,2,3,4,5,3,3,2,5,5,3,4,3,2,3,3,2,2,2]
n = 5  # number of paragraphs the user rates up front
user_scores = [0 for i in range(n)]
# Convert embeddings from a string containing the vector to just a vector.
content_vecs = [ast.literal_eval(r['comp_embeddings']) for _, r in df.iterrows()]
# Show each of the first n completions and collect the user's integer rating.
for i in range(n):
    print(df.completion[i])
    print("Rating:", end=" ")
    rate = input()
    user_scores[i] = int(rate)
    print()
print(f"These are the user scores for the first {n} paragraphs: {user_scores}")
# Reduce the embeddings to 10 dimensions, then fit a linear regression of the
# user's scores on the reduced embeddings of the paragraphs they just rated.
content_vecs = reduce_dim_pca(content_vecs)
x = np.array(content_vecs)[:len(user_scores)]   # features for the rated paragraphs only
y = np.array(user_scores).reshape(-1, 1)        # ratings as a column vector
model.fit(x, y)
trained_uservec = model.coef_  # This is the user vector derived from the linear regression
vv = np.array(trained_uservec)
cc = np.array(content_vecs)
# cosine_similarity(i'th content vector, user weight vector) for every paragraph
arr = cs(cc, vv)
arr = [a[0] for a in arr]  # flatten the (n_samples, 1) similarity matrix to a list
inds = np.argpartition(arr, -n)[-n:]  # indices of the top n ranked paragraphs
inds = list(inds)
res = pd.DataFrame(df.iloc[inds]['completion'])
final_user_scores = [0 for i in range(n)]
# Model's predicted score for each selected paragraph (10 = PCA output dims;
# must match reduce_dim_pca's n_components).
final_predicted_scores = [
    float(model.predict(np.array(content_vecs[inds[i]]).reshape((1, 10)))[0][0])
    for i in range(n)
]
print(f"We've found {n} paragraphs that we would like you to rate.")
print()
# Collect the user's ratings of the recommended paragraphs so they can be
# compared against the model's predictions.
for i in range(n):
    print(res['completion'][inds[i]])
    print("Rating:", end=" ")
    score = input()
    final_user_scores[i] = int(score)
    print()
print(f"These are the final user scores {final_user_scores}")
print(f"These are the predicted final scores {final_predicted_scores}")