Initial commit.

Yeah I know there are a lot of nonessential files but w/e.
This commit is contained in:
Jacob Signorovitch 2024-12-17 01:39:52 -05:00
commit 2cb11e4933
18 changed files with 38916 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
venv
.env
nohup.out
__pycache__
.ropeproject

5252
Delta Normal file

File diff suppressed because it is too large Load Diff

5502
NN Normal file

File diff suppressed because it is too large Load Diff

5
README.md Normal file
View File

@ -0,0 +1,5 @@
# ASCO Abstract Success Predictor
A simple web interface that predicts the chance that a given abstract will be
accepted for oral presentation, based only on its title. The code is currently
in shambles; I might bring it up to my current standards some day.

28
app.py Normal file
View File

@ -0,0 +1,28 @@
from flask import Flask, redirect, render_template, request, url_for
import fit
app = Flask(__name__)


def _italics_only(markup):
    """Escape ``markup`` for HTML, leaving only bare ``<i>``/``</i>`` tags live.

    The result list built by ``fit.closest`` wraps the author/status part of
    each entry in ``<i>...</i>``, and the template prints each entry with the
    ``safe`` filter.  Because that text travels through the query string it is
    attacker-controllable, so everything except those two tags is escaped.
    """
    import html  # local import: not part of this module's import header

    escaped = html.escape(markup)
    return escaped.replace("&lt;i&gt;", "<i>").replace("&lt;/i&gt;", "</i>")


@app.route("/", methods=("GET", "POST"))
def index():
    """Render the evaluator page, or score a submitted title and redirect.

    POST: reads ``title`` from the form, embeds it via ``fit.get``, computes
    the presentation percentage (``fit.percent``) and the nearest titles plus
    similarity percentile (``fit.closest``), then redirects to this same route
    with those values in the query string.

    GET: reads ``result``, ``title``, ``closest`` and ``tprob`` from the query
    string and renders ``index.html`` with them.  ``title`` defaults to the
    empty string; ``closest`` is sanitised because the template renders it
    unescaped.
    """
    if request.method == "POST":
        title = request.form["title"]
        embedding = fit.get(title)
        percent = fit.percent(embedding)
        # fit.closest returns [html_lines, tail_percentile].
        closest, tprob = fit.closest(embedding, 10)
        return redirect(
            url_for("index", tprob=tprob, result=percent, title=title, closest=closest)
        )

    result = request.args.get("result")
    title = request.args.get("title")
    closest = request.args.get("closest")
    tprob = request.args.get("tprob")
    if title is None:
        title = ""
    if closest is not None:
        # Anyone can craft this query parameter; never hand it to `|safe` raw.
        closest = _italics_only(closest)
    return render_template(
        "index.html", tprob=tprob, result=result, title=title, closest=closest
    )

1536
beta Normal file

File diff suppressed because it is too large Load Diff

5252
embeddings2.nsv Normal file

File diff suppressed because one or more lines are too long

58
fit.py Normal file
View File

@ -0,0 +1,58 @@
import numpy as ν
import csv
import get_emb
# Upper bound on how many nearest titles closest() will return.
MAX_RES = 10

# Each data file is read inside a `with` block so its handle is closed as soon
# as the contents are in memory instead of staying open for the process
# lifetime.

# Regression coefficients, one float per line; the text after the final
# newline is dropped.
with open('beta', 'r') as φ:  # coefficients
    β = [float(el) for el in φ.read().split('\n')[:-1]]

# One float per line, same trailing-entry handling as `beta`.  fit.r writes
# this file as each training embedding's largest dot product with any other
# training embedding.
with open('Delta', 'r') as _f:
    Δ = [float(el) for el in _f.read().split('\n')[:-1]]

# Intercept added to the linear predictor in percent().  An earlier value,
# -0.8569279 ("some magic constant"), was replaced by 0.
α = 0

# Records are laid out as TITLE\nAUTHORS\nACCEPTED? followed by one more line
# (fit.r appends an empty column when writing titles2.text), hence the
# stride of 4 below.
with open('titles2.text', 'r') as _f:
    M = _f.read().strip().split('\n')
T = ν.array(M[0::4])  # titles
A = ν.array(M[1::4])  # authors
O = ν.array(M[2::4])  # ACCEPTED? flags; closest() compares them to "TRUE"

# Comma-separated embedding matrix; closest() takes dot products of its rows
# with a query embedding.
with open('embeddings2.nsv', 'rb') as _f:
    X = ν.loadtxt(_f, delimiter=',', skiprows=0)

# Lines of the file 'NN'.  NOTE(review): no function visible in this module
# reads NN -- confirm whether it is still needed.
with open('NN', 'r') as _f:
    NN = _f.read().split('\n')
def get(θ):
    """Return the embedding for the text ``θ``.

    Delegates to ``get_emb.get_embedding`` with its default model.
    """
    embedding = get_emb.get_embedding(θ)
    return embedding
def percent(χ, coef=None, intercept=None):
    """Return the logistic-model probability for embedding ``χ`` as ``"NN%"``.

    The linear predictor is ``intercept + dot(χ, coef)`` and the probability is
    its logistic transform, rounded to a whole percentage.

    Parameters:
        χ: embedding vector (sequence of floats) with one entry per coefficient.
        coef: coefficient vector; defaults to the module-level ``β``.
        intercept: additive constant; defaults to the module-level ``α``.

    Returns:
        A string such as ``"5%"``, ``"50%"`` or ``"100%"``.
    """
    if coef is None:
        coef = β
    if intercept is None:
        intercept = α
    γ = float(intercept + ν.dot(χ, coef))
    # Use whichever form of the logistic function keeps exp()'s argument
    # non-positive, so a large |γ| cannot overflow into inf/inf = nan.
    if γ >= 0:
        π = 1.0 / (1.0 + ν.exp(-γ))
    else:
        e = ν.exp(γ)
        π = e / (1.0 + e)
    # Format numerically: slicing str(π) breaks on values such as 0.5 ("5"),
    # 1.0 ("0") and anything printed in scientific notation.
    return str(int(round(float(π) * 100))) + '%'
def closest(χ, n):
    """Find the stored titles most similar to the embedding ``χ``.

    Similarity is the dot product of ``χ`` with each row of ``X``.

    Parameters:
        χ: query embedding vector.
        n: requested number of results.  Its absolute value is reduced modulo
           ``MAX_RES``; a remainder of 0 means ``MAX_RES`` results.

    Returns:
        A two-item list ``[out, tailprob]``:
        ``out`` is one line per match, most similar first, in the form
        ``TITLE <i>(AUTHORS, presented|online-only)</i>`` with a trailing
        newline after every entry; ``tailprob`` is
        ``int(percentile_far(best_similarity) * 100)``.
    """
    n = abs(n) % MAX_RES
    if n == 0:
        n = MAX_RES
    ψ = ν.array(ν.dot(X, χ))
    # Sort once; take the n largest similarities and put the best match first.
    nearest = ν.argsort(ψ)[-n:][::-1]
    lines = []
    for title, authors, accepted in zip(T[nearest], A[nearest], O[nearest]):
        p = "presented" if accepted == "TRUE" else "online-only"
        # NOTE(review): titles/authors are inserted into markup unescaped and
        # the template prints these lines with `|safe`; this relies on the
        # data file containing no HTML-significant characters.
        lines.append(title + " <i>(" + authors + ", " + p + ")</i>\n")
    tailprob = int(percentile_far(ν.max(ψ)) * 100)
    return ["".join(lines), tailprob]
def percentile_far(q_dist):
    """Return the fraction of entries in ``Δ`` that are ``<= q_dist``.

    ``Δ`` holds, per fit.r, each training embedding's largest dot product with
    any other training embedding, so the result is the share of training
    titles whose best match is no more similar than ``q_dist``.
    """
    reference = ν.array(Δ)
    at_or_below = 1 * (reference <= q_dist)  # 0/1 indicator per entry
    return sum(at_or_below) / len(Δ)

37
fit.r Executable file
View File

@ -0,0 +1,37 @@
library('glmnet')
library('pROC')
library('survival')
# Model-fitting script: reads titles and embeddings, drops duplicate titles,
# fits a penalised logistic regression with glmnet, and writes the files
# 'titles2.text', 'embeddings2.nsv', 'beta' and 'Delta'.

# Logistic function. Only referenced from the commented-out code below.
inv.logit <- function(x) {exp(x) / (1 + exp(x))}
# Read 'titles.text' with newline as the separator (one row per line).
read.table('titles.text', header=FALSE, quote='', sep="\n") -> D
# Reshape into a 3 x (n/3) matrix, filled column by column, so each column is
# one record of three consecutive lines; the transpose gives one row per record.
matrix(as.vector(D[,1]), 3, nrow(D) / 3) -> M
M <- t(M)
X <- as.matrix(read.csv('embeddings.nsv', header=F)) # matrix of embeddings
# remove duplicates (if present)
# Duplicates are detected on the first column of M. X is filtered first, while
# M still holds the unfiltered rows the mask is computed from.
X <- X[!duplicated(M[,1]),]
M <- M[!duplicated(M[,1]),]
# Write the de-duplicated records back out. cbind(M, "") appends an empty
# fourth column, so every record occupies four lines, the last one empty.
write.table(as.vector(t(cbind(M,""))), "titles2.text", sep="\n", row.names=FALSE, col.names=FALSE, quote=F)
write.table(X, "embeddings2.nsv", sep=",", row.names=F, col.names=F, quote=F);
# Outcome: third column of M parsed as logical, then converted to 0/1.
Y <- as.numeric(as.logical(M[,3]))
# Cross-validate the penalty using AUC as the measure, then refit at lambda.min.
V <- cv.glmnet(x=X, y=Y, family="binomial", type.measure="auc")
Z <- glmnet(x=X, y=Y, lambda=V$lambda.min, family="binomial")
# Predictions on the training matrix itself, on the response scale.
A <- predict(Z, newx=X, type="response")
# Logistic regression of the outcome on those predictions, and its ROC curve.
# Neither B nor R is used by the live code that follows.
B <- glm(Y~A, family="binomial")
R <- roc(Y, B$fitted.values)
# NOTE(review): only the coefficient vector is exported; no intercept is
# written out. Confirm this matches the intercept of 0 used in fit.py.
beta <- Z$beta;
#P <- B$fitted.values
#O1 <- X %*% as.vector(Z$beta)
#O2 <- B$coefficients[1] + B$coefficients[2] * O1
#O3 <- inv.logit(exp(1) + O2)
#beta <- as.vector(Z$beta) * B$coefficients[2];
#alpha <- B$coefficients[1] + exp(1);
# Pairwise dot products between all rows of X. The diagonal (each row with
# itself) is set to NA so it is ignored by the row-wise maximum below.
δ <- X %*% t(X)
diag(δ) <- NA
# For each row: its largest dot product with any other row.
Δ <- apply(δ, 1, max, na.rm=T)
# Export the coefficients and the per-row maxima to the files 'beta' and 'Delta'.
write.table(as.vector(beta), "beta", sep="\n", row.names=F, col.names=F)
write.table(Δ, "Delta", sep="\n", row.names=F, col.names=F)

5
get_emb.py Normal file
View File

@ -0,0 +1,5 @@
import openai
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    A client is created on each call, ``text`` is sent as a single-item input
    list, and the ``embedding`` attribute of the first returned item is
    handed back.
    """
    api = openai.OpenAI()
    response = api.embeddings.create(model=model, input=[text])
    first_item = response.data[0]
    return first_item.embedding

2
gunicorn_config.py Normal file
View File

@ -0,0 +1,2 @@
# Gunicorn settings.
# NOTE(review): run.sh starts gunicorn with --bind=0.0.0.0:8077 and does not
# reference this file -- confirm whether these values are used anywhere.
bind = "0.0.0.0:80"  # listen address and port
workers = 2  # presumably the number of worker processes -- confirm against gunicorn docs

3
run.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/sh
# Start gunicorn serving the `app` object from app.py, bound to port 8077 on
# all interfaces, with the worker temp directory set to /dev/shm.
gunicorn --bind=0.0.0.0:8077 --worker-tmp-dir /dev/shm app:app

27
static/about.html Normal file
View File

@ -0,0 +1,27 @@
<!DOCTYPE html>
<!-- Static "About" page: describes where the training data came from and how the model was fit. -->
<html lang="en">
<head>
<meta charset="UTF-8">
<!-- Relative path: main.css sits in the same static/ directory as this page. -->
<link rel="stylesheet" href="main.css">
<title>About</title>
</head>
<body>
<center><h1>About</h1></center>
<h2>The Data</h2>
<p>
Information was scraped from <a href="//ascopubs.org/jco/meeting">this page</a> on 2023-06-28.
Over 5000 abstract titles from 2023 were used as training data.
Embeddings for these were generated through the OpenAI API, with the <code>text-embedding-ada-002</code> model.
</p>
<h2>The Model</h2>
<p>
A penalized logistic regression model was fit using the <code>glmnet</code> R package.
The tuning parameter was selected using cross validation.
The area under the ROC curve was 0.83 in the training data.
</p>
<!-- ".." resolves to the parent of static/, i.e. the evaluator page. -->
<center><a href="..">back</a></center>
</body>
</html>

BIN
static/gruv-4.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 536 KiB

129
static/main.css Normal file
View File

@ -0,0 +1,129 @@
/* Site stylesheet: dark monospace theme, linked from about.html and index.html. */
/* global */
/* Custom properties used via var() below: --fgm text colour, --bgm outer
   background, --bgs page background, --bgr background of inputs and code. */
:root {
font-family: monospace;
--fgm: #bdae93;
--bgm: #1d2021;
--bgs: #282828;
--bgr: #32302f;
}
/* Square corners on every element. */
* {
border-radius: 0px !important;
}
/* background */
/* The root element is a flex container that centres the body horizontally. */
html {
background-color: var(--bgm);
padding: 0;
margin: 0;
display: flex;
justify-content: center;
font-size: 16px;
}
/* page */
/* A single column at most 50em wide and at least the viewport height. */
body {
background-color: var(--bgs);
color: var(--fgm);
margin: 0;
padding-inline: 2em;
padding-block: 1em;
height: fit-content;
min-height: 100vh;
max-width: 50em;
flex-grow: 1;
font-size: 18px;
}
/* headers */
/* One colour per heading level; all levels h1-h5 are bold. */
h1 {
color: #fb4934;
} h2 {
color: #fabd2f;
} h3 {
color: #b8bb26;
} h4 {
color: #8ec07c
} h5 {
color: #83a598;
} h1, h2,
h3, h4,
h5 {
font-weight: bold;
}
/* text styles */
b, i, u {
color: #d79921;
}
/* links */
/* Hovering swaps a link to page-coloured text on a solid link-coloured background. */
a {
color: #458588;
} a:hover {
color: var(--bgs);
background-color: #458588;
text-decoration: none;
}
/* preformatted */
pre,code {
color: #8ec07c;
background-color: var(--bgr);
width: min-content;
}
/* misc */
/* Elements with class "c" blur on hover.
   NOTE(review): neither HTML file shown with this stylesheet uses class "c". */
.c {
color: #458588;
transition: filter 0.2s;
cursor: help;
} .c:hover {
filter: blur(2px)
}
form {
margin: 1em;
padding: 1em;
}
/* Form controls: borderless, monospace, same background as code blocks. */
input[type="text"],
textarea,
input[type="number"],
input[type="submit"] {
background-color: var(--bgr);
border: none;
outline: none;
color: #8ec07c !important;
font-size: 1em;
font-family: monospace;
width: min-content !important;
}
/* Active or focused controls invert: coloured background, page-coloured text.
   NOTE(review): #83c07c here and on the submit border differs by one digit
   from the #8ec07c used elsewhere; confirm it is intentional. */
textarea:active,
textarea:focus,
input[type="submit"]:active,
input[type="submit"]:focus,
input[type="text"]:active,
input[type="number"]:active,
input[type="text"]:focus,
input[type="number"]:focus {
background-color: #83c07c;
color: var(--bgs) !important;
border: none;
outline: none;
font-size: 1em;
font-family: monospace;
}
input {
margin: 0.2em;
border-radius: 0px !important;
}
/* Declared after the rules above, so the submit button keeps a border even
   though they set border: none. */
input[type="submit"],
input[type="submit"]:active,
input[type="submit"]:focus {
border: 3px solid #83c07c
}

BIN
static/stairs.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.0 MiB

67
templates/index.html Normal file
View File

@ -0,0 +1,67 @@
<!DOCTYPE html>
<!-- Evaluator page template. Variables used: title, tprob, result, closest. -->
<head>
<title>ASCO Abstract Title Evaluator</title>
<link rel="stylesheet" href="static/main.css">
</head>
<body>
<center><h1>ASCO Abstract Title Evaluator</h1></center>
<!-- The title form posts back to "/"; the textarea is pre-filled with the previous title. -->
<center><form action="/" method="post">
<textarea rows="3"
cols="64"
name="title"
placeholder="Enter abstract title..."
required>{{ title }}</textarea><br>
<input type="submit" value="Evaluate"/>
</form></center>
<!-- Similarity percentile, shown only when tprob is set. -->
{% if tprob %}
<center style="margin-inline:10em; text-align: justify;"><p>
<h3>{{ tprob }}% Similarity</h3>
The similarity of your abstract title to accepted
ASCO 2023 titles is in this percentile.
</p></center>
<br>
{% endif %}
<!-- Predicted chance of on-site presentation, with a warning when tprob is below 5. -->
{% if result %}
<center style="margin-inline:10em; text-align: justify;"><p>
<h3>{{ result }} Chance of on-site presentation</h3>
Probability of on-site presentation
versus online only publication of your
abstract, based on the title, if
accepted.
{% if tprob|int < 5 %}
<p><i>Warning: the similarity is too low to be confident in this prediction.</i></p><br>
{% endif %}
</p></center>
<br>
{% endif %}
<!-- Nearest titles: closest is a newline-separated string; the piece after the final newline is dropped. -->
<!-- NOTE(review): each entry is printed with the safe filter, i.e. without HTML escaping, so the view must only pass trusted or sanitised markup here. -->
{% if closest %}
{% set closed = closest.split('\n')[:-1] %}
<h3>Closest {{ closed|length }} titles from ASCO 2023:</h3>
<ol>
{% for close in closed %}
<li>{{ close|safe }}</li>
{% endfor %}
</ol>
{% endif %}
<center><a href="{{ url_for('static', filename='about.html') }}">about</a></center>
<script>
/**
 * Keydown handler that makes a plain Enter submit the enclosing form.
 *
 * Shift+Enter and every other key are left alone, so Shift+Enter still
 * inserts a newline. For a plain Enter the default action (inserting a
 * newline) is always suppressed; the form is submitted once per physical key
 * press, so auto-repeat events while the key is held are ignored.
 *
 * @param {KeyboardEvent} event keydown event whose target is a form control.
 */
function sub(event) {
  if (event.key !== 'Enter' || event.shiftKey) {
    return;
  }
  event.preventDefault();
  if (event.repeat) {
    return;
  }
  const form = event.target.form;
  // Dispatching a synthetic "submit" Event only runs listeners; it never
  // submits the form. requestSubmit() behaves like clicking the submit
  // button, including constraint validation of the `required` field.
  if (typeof form.requestSubmit === 'function') {
    form.requestSubmit();
  } else if (form.checkValidity()) {
    // Older browsers: submit() skips validation, so check it explicitly.
    form.submit();
  }
}
// Attach the Enter-key handler to the first <textarea> in the document (the title field).
document.querySelector("textarea").addEventListener("keydown", sub);
</script>
</body>

21008
titles2.text Normal file

File diff suppressed because it is too large Load Diff