Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
NLP-Web-App/model.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
65 lines (57 sloc)
2.67 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from http.client import OK | |
import pandas as pd | |
import numpy as np | |
from sklearn.linear_model import LinearRegression | |
import ast | |
from sklearn.metrics.pairwise import cosine_similarity as cs | |
from sklearn.manifold import TSNE | |
from sklearn.decomposition import PCA | |
# Load precomputed embeddings for every paragraph. Each row holds (in order):
# prompt, completion, prompt-no-labels embeddings, prompt embeddings, completion embeddings.
df = pd.read_csv('dyslexia_embeddings_ada.csv') # This csv stores (in order): prompt, completion, prompt-no-labels embeddings, prompt embeddings, completion embeddings
# Fitted later on (reduced completion embeddings -> user ratings); its learned
# coefficients act as a "user preference vector" for ranking paragraphs.
model = LinearRegression()
# Following two methods are different ways of reducing the dimensionality of the large embeddings
def reduce_dim_tsne(content_vecs):
    """Project the embedding vectors down to 3 dimensions with t-SNE.

    The random_state is pinned so repeated runs give the same layout.
    Note: t-SNE refits from scratch on every call and cannot transform
    new, unseen points.
    """
    reducer = TSNE(
        n_components=3,
        perplexity=15,
        random_state=42,
        init='random',
        learning_rate=200,
    )
    return reducer.fit_transform(content_vecs)
def reduce_dim_pca(content_vecs):
    """Reduce the embedding vectors to their 10 leading principal components."""
    reduced = PCA(n_components=10).fit_transform(content_vecs)
    return reduced
# ---- Collect the user's ratings for the first n paragraphs ----
# This vector will store the user's ratings of each paragraph.
# Brian's scores of the first 25 are [3,4,4,2,2,1,1,2,3,4,5,3,3,2,5,5,3,4,3,2,3,3,2,2,2]
n = 5
user_scores = [0 for i in range(n)]
# Convert each embedding from its string representation in the csv to a real list of floats.
content_vecs = [ast.literal_eval(r['comp_embeddings']) for _, r in df.iterrows()]
for i in range(n):
    print(df.completion[i])
    print("Rating:", end=" ")
    rate = input()
    user_scores[i] = int(rate)
    print()
print(f"These are the user scores for the first {n} paragraphs: {user_scores}")

# ---- Fit a linear model: reduced embedding -> user rating ----
content_vecs = reduce_dim_pca(content_vecs)
x = np.array(content_vecs)[:len(user_scores)]
y = np.array(user_scores).reshape(-1, 1)  # column vector, as sklearn expects
model.fit(x, y)
trained_uservec = model.coef_  # This is the user vector derived from the linear regression

# ---- Rank every paragraph by cosine similarity to the learned user vector ----
vv = np.array(trained_uservec)
cc = np.array(content_vecs)
arr = cs(cc, vv)  # cosine_similarity(i'th content vector, user vector) for every paragraph
arr = [a[0] for a in arr]
inds = np.argpartition(arr, -n)[-n:]  # indices of the n highest-similarity paragraphs
inds = list(inds)
res = pd.DataFrame(df.iloc[inds]['completion'])
final_user_scores = [0 for i in range(n)]
# reshape(1, -1) stays correct even if the PCA dimensionality above changes.
final_predicted_scores = [
    float(model.predict(np.array(content_vecs[inds[i]]).reshape(1, -1))[0][0])
    for i in range(n)
]

# ---- Collect the user's ratings for the recommended paragraphs ----
print(f"We've found {n} paragraphs that we would like you to rate.")
print("")
for i in range(n):
    print(res['completion'][inds[i]])
    print("Rating:", end=" ")
    score = input()
    final_user_scores[i] = int(score)
    print()
print(f"These are the final user scores {final_user_scores}")
print(f"These are the predicted final scores {final_predicted_scores}")