from sentence_transformers import util
def search(query, k=5):
    """Print the ``k`` rows of ``work`` most similar to ``query``.

    The query is encoded with the module-level ``model`` (normalized
    embeddings) and compared by cosine similarity against the module-level
    ``emb`` matrix. For each hit, the rank, similarity, ``taxonomy_level_1``
    and ``open_status`` are printed, followed by the first 260 characters of
    the ``TEXT_COL`` field with newlines replaced by spaces.

    Args:
        query: Free-text query string.
        k: Maximum number of hits to print.

    Returns:
        None. Results are written to stdout.

    Note:
        Hits are looked up with ``work.iloc``, so the rows of ``emb`` are
        assumed to be in the same positional order as the rows of ``work``.
    """
    query_vec = model.encode([query], normalize_embeddings=True)
    scores = util.cos_sim(query_vec, emb)[0].cpu().numpy()
    # argsort is ascending; reverse it so the best match comes first.
    top_hits = scores.argsort()[::-1][:k]

    print(f'\n=== Query: "{query}" ===')
    for rank, hit in enumerate(top_hits, start=1):
        row = work.iloc[hit]
        header = (f"\n[{rank}] sim={scores[hit]:.3f} | {row['taxonomy_level_1']} "
                  f"| status={row['open_status']}")
        print(header)
        snippet = row[TEXT_COL][:260].replace("\n", " ")
        print(" ", snippet, "...")
# Demo: run the semantic search on two sample queries.
for demo_query in (
    "rational points on hyperelliptic curves",
    "multiplicativity of maximal output p-norm of a quantum channel",
):
    search(demo_query)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
# Labels: the open_status value of every row in `work`.
y = work["open_status"].values

# Hold out 25% of the rows for evaluation; stratify on the label so both
# splits keep the same class proportions.
Xtr, Xte, ytr, yte = train_test_split(
    emb,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Logistic regression on the embeddings, with class weights balanced.
clf = LogisticRegression(C=2.0, class_weight="balanced", max_iter=2000)
clf.fit(Xtr, ytr)
pred = clf.predict(Xte)

print("\n=== open_status classifier (embeddings + logistic regression) ===")
print(classification_report(yte, pred))

# Confusion matrix normalized over the true labels (each row sums to 1).
fig, ax = plt.subplots(figsize=(7, 6))
ConfusionMatrixDisplay.from_predictions(
    yte,
    pred,
    normalize="true",
    values_format=".2f",
    cmap="Blues",
    xticks_rotation=45,
    ax=ax,
)
ax.set_title("open_status confusion matrix (row-normalized)")
plt.tight_layout()
plt.show()
# Find the two distinct rows of `work` whose embeddings are most similar.
# NOTE: this materializes the full pairwise similarity matrix (n x n).
sims = util.cos_sim(emb, emb).cpu().numpy()
# Mask self-similarity with -inf rather than 0: cosine similarity can be
# negative, so 0 is not guaranteed to lose against every off-diagonal entry
# and argmax could otherwise pick a row paired with itself.
np.fill_diagonal(sims, -np.inf)
i, j = np.unravel_index(sims.argmax(), sims.shape)
print(f"\nMost similar pair (cos={sims[i, j]:.3f}):")
for n in (i, j):
    row = work.iloc[n]  # look the row up once instead of once per field
    print(f"\n paper_id={row['paper_id']} | "
          f"{row['taxonomy_level_1']}")
    print(" ", row[TEXT_COL][:240].replace("\n", " "), "...")
print("\nDone. Set SAMPLE_SIZE=None at the top to run on the full 14.1k rows.")
LIVE NEWS
- House passes bill to provide more Ukraine aid and impose new sanctions on Russia
- A study of 8,300 older adults revealed a surprising salt habit
- Why 95% of enterprise GPUs sit idle while AI startups can’t get compute
- OpenAI to comply with Trump AI model review order: Osborne
- Building a Semantic Search Engine and Open-Status Classifier over the ResearchMath-14k Dataset
- Ice-sheet regime shifts with climate warming
- George Santos threatened me after I wrote about him : NPR
- AI PCs and HIPAA: Here’s What Healthcare Organizations Need to Know
