Session 12 — fastText, Doc2Vec & Semantic Search
Objective: understand word and document embeddings that are robust to typos and morphological variation (fastText), learn document vectors directly (Doc2Vec), and build vector-based semantic search.
Learning Outcomes: (1) Train fastText & Doc2Vec on a small corpus; (2) Build a vector index for semantic search; (3) Compare against TF–IDF/Word2Vec pooling; (4) Evaluate retrieval@k and perform error analysis.
1) Core Concepts
- fastText: an embedding model with subword information (character n-grams); strong on OOV words and typos (see the n-gram sketch after this list).
- Doc2Vec: introduces paragraph/document vectors (PV-DM, PV-DBOW) that are learned directly alongside word vectors.
- Semantic search: retrieval by vector similarity (cosine) rather than literal keyword matching.
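To make the subword idea concrete, here is a minimal sketch (not gensim's internal code) of how fastText decomposes a word into character n-grams; the '<' and '>' boundary markers follow the fastText convention:
# Illustrative fastText-style character n-grams (3..5) with boundary markers.
def char_ngrams(word, min_n=3, max_n=5):
    w = f'<{word}>'
    return [w[i:i+n] for n in range(min_n, max_n + 1) for i in range(len(w) - n + 1)]
print(char_ngrams('pengiriman')[:6])  # ['<pe', 'pen', 'eng', 'ngi', 'gir', 'iri']
A typo such as 'pengirimann' shares most of these n-grams with 'pengiriman', which is why fastText can still place it near the correct vector.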
2) Google Colab Practice — Training fastText & Doc2Vec
Use the preprocessed corpus from Session 3 (at least 30 documents). We also prepare simple queries and ground truth for retrieval evaluation.
A. Setup & Data
!pip -q install pandas numpy scikit-learn gensim matplotlib
import numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
# Load the corpus
try:
    df = pd.read_csv('corpus_sessi3_variants.csv')
    docs = df['v2_stop_stemID'].dropna().astype(str).tolist()
except (FileNotFoundError, KeyError):
    docs = pd.read_csv('corpus_sessi2_normalized.csv')['text'].dropna().astype(str).tolist()
print('Documents:', len(docs))
# Simple queries & ground truth (adjust to your domain)
QUERIES = {
    'pengiriman cepat': ['pengiriman', 'cepat'],
    'refund cepat': ['refund', 'cepat', 'proses'],
    'screen glare': ['screen', 'glare', 'outdoor'],
    'bluetooth stabil': ['bluetooth', 'stabil'],
    'login delay': ['login', 'delay']
}
# Heuristic ground truth → list of relevant document indices per query
GT = {}
for q, kws in QUERIES.items():
    rel = []
    for i, d in enumerate(docs):
        tok = set(d.split())
        if any(k in tok for k in kws):
            rel.append(i)
    GT[q] = sorted(set(rel))
print('Sample GT:', {k: v[:3] for k, v in list(GT.items())[:2]})
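Note that this ground truth is keyword-based, so it slightly favors lexical methods such as TF–IDF, and a query whose keywords never occur yields an empty relevant set (P@k is then 0 for every method). A quick sanity check, as a sketch:
# GT sizes per query; empty sets make P@k uninformative for that query.
for q, rel in GT.items():
    print(f'{q!r}: {len(rel)} relevant docs')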
B. Baseline: TF–IDF Retrieval
vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95, sublinear_tf=True, norm='l2')
X = vec.fit_transform(docs)
def search_tfidf(query, topk=5):
    q = vec.transform([query])
    sims = cosine_similarity(X, q).ravel()
    order = sims.argsort()[::-1][:topk]
    return [(i, float(sims[i]), docs[i]) for i in order]
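A quick smoke test with one of the queries defined above (adjust to your domain):
# Top-3 lexical matches; scores are cosine similarities.
for i, score, text in search_tfidf('pengiriman cepat', topk=3):
    print(f'{i:>4}  {score:.3f}  {text[:60]}')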
C. fastText (Gensim)
from gensim.models import FastText
sentences = [d.split() for d in docs]
ft = FastText(
    vector_size=200, window=5, min_count=1, workers=2,
    sg=1, negative=10, min_n=3, max_n=5, seed=42
)
ft.build_vocab(sentences)
ft.train(sentences, total_examples=len(sentences), epochs=20)
# Document vectors: mean of word vectors (subwords keep this robust to typos)
def docvec_fasttext(tokens):
    vs = [ft.wv[w] for w in tokens if w in ft.wv]
    return np.mean(vs, axis=0) if len(vs) else np.zeros(ft.vector_size)
FT = np.vstack([docvec_fasttext(d.split()) for d in docs])
from sklearn.preprocessing import normalize
FT = normalize(FT)  # unit-length rows, so cosine similarity reduces to a dot product
def search_fasttext(query, topk=5):
    vs = [ft.wv[w] for w in query.split() if w in ft.wv]
    qv = np.mean(vs, axis=0) if vs else np.zeros(ft.vector_size)
    qv = normalize(qv.reshape(1, -1))
    sims = (FT @ qv.T).ravel()
    order = sims.argsort()[::-1][:topk]
    return [(i, float(sims[i]), docs[i]) for i in order]
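Because fastText composes vectors from character n-grams, it can embed words it never saw during training. A minimal sketch ('pengirimann' is a hypothetical typo; substitute one from your own corpus):
# OOV handling: the typo is absent from the vocabulary but still gets a vector.
print('pengirimann' in ft.wv.key_to_index)        # False: not in the trained vocab
print(ft.wv.most_similar('pengirimann', topn=3))  # neighbors via shared n-grams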
D. Doc2Vec (PV‑DBOW & PV‑DM)
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
corpus = [TaggedDocument(d.split(), [i]) for i,d in enumerate(docs)]
d2v_dbow = Doc2Vec(vector_size=200, window=5, min_count=1, workers=2, dm=0, negative=10, seed=42, epochs=30)
d2v_dbow.build_vocab(corpus)
d2v_dbow.train(corpus, total_examples=len(corpus), epochs=d2v_dbow.epochs)
# PV-DM variant
d2v_dm = Doc2Vec(vector_size=200, window=5, min_count=1, workers=2, dm=1, negative=10, seed=42, epochs=30)
d2v_dm.build_vocab(corpus)
d2v_dm.train(corpus, total_examples=len(corpus), epochs=d2v_dm.epochs)
DV_DBOW = np.vstack([d2v_dbow.dv[i] for i in range(len(docs))])
DV_DM = np.vstack([d2v_dm.dv[i] for i in range(len(docs))])
DV_DBOW = normalize(DV_DBOW); DV_DM = normalize(DV_DM)
def search_doc2vec(model, query, topk=5):
    qv = normalize(model.infer_vector(query.split(), epochs=30).reshape(1, -1))
    M = DV_DBOW if model is d2v_dbow else DV_DM
    sims = (M @ qv.T).ravel()
    order = sims.argsort()[::-1][:topk]
    return [(i, float(sims[i]), docs[i]) for i in order]
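infer_vector runs a fresh gradient descent per call, so the same query can produce slightly different vectors (and rankings) across calls. A quick stability check, as a sketch:
# Two inferences of the same query; a cosine near 1.0 means the ranking is stable.
v1 = normalize(d2v_dbow.infer_vector('pengiriman cepat'.split(), epochs=30).reshape(1, -1))
v2 = normalize(d2v_dbow.infer_vector('pengiriman cepat'.split(), epochs=30).reshape(1, -1))
print('cosine between two inferences:', float(v1 @ v2.T))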
E. Retrieval Evaluation & Comparison
def precision_at_k(ranked, relset, k=5):
    return sum(1 for i in ranked[:k] if i in relset) / k

def evaluate(search_fn, name, topk=5):
    rows = []
    for q in QUERIES:
        res = search_fn(q, topk)
        ranked = [i for i, _, _ in res]
        relset = set(GT[q])
        p5 = precision_at_k(ranked, relset, topk)
        rows.append((q, p5))
    df = pd.DataFrame(rows, columns=['query', 'P@' + str(topk)])
    df.loc['mean'] = ['mean', df.iloc[:, 1].mean()]
    print('\n==', name, '==')
    print(df)
# Run the comparison
evaluate(lambda q,k=5: search_tfidf(q,k), 'TF–IDF', 5)
evaluate(lambda q,k=5: search_fasttext(q,k), 'fastText‑avg', 5)
evaluate(lambda q,k=5: search_doc2vec(d2v_dbow,q,k), 'Doc2Vec‑DBOW', 5)
evaluate(lambda q,k=5: search_doc2vec(d2v_dm,q,k), 'Doc2Vec‑DM', 5)
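P@k ignores where in the top-k the hits land; Mean Reciprocal Rank (MRR) is a common complement. A minimal sketch reusing the search functions above:
# MRR: mean over queries of 1/rank of the first relevant hit (0 if none in top-k).
def mrr(search_fn, topk=10):
    scores = []
    for q in QUERIES:
        ranked = [i for i, _, _ in search_fn(q, topk)]
        rr = next((1.0 / (r + 1) for r, i in enumerate(ranked) if i in set(GT[q])), 0.0)
        scores.append(rr)
    return float(np.mean(scores))
print('MRR TF-IDF      :', round(mrr(lambda q, k=10: search_tfidf(q, k)), 3))
print('MRR fastText-avg:', round(mrr(lambda q, k=10: search_fasttext(q, k)), 3))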
F. (Optional) SIF Sentence Embeddings
# Smooth Inverse Frequency (SIF): weight each word by a/(a + p(w)), then remove the first principal component
from collections import Counter
# Estimate word frequencies from the corpus
all_tokens = [w for d in docs for w in d.split()]
cnt = Counter(all_tokens)
N = sum(cnt.values())
p = {w: cnt[w]/N for w in cnt}
a = 1e-3
# Use fastText for the word vectors
W = []
for d in docs:
    toks = d.split()
    vs = []
    for w in toks:
        if w in ft.wv:
            weight = a / (a + p.get(w, 1e-9))
            vs.append(weight * ft.wv[w])
    W.append(np.mean(vs, axis=0) if vs else np.zeros(ft.vector_size))
W = np.vstack(W)
# Remove the first principal component (shared "discourse" direction)
from sklearn.decomposition import PCA
pc = PCA(n_components=1).fit(W).components_[0]
W_sif = W - (W @ pc[:,None]) * pc[None,:]
W_sif = normalize(W_sif)
def search_sif(query, topk=5):
    vs = []
    for w in query.split():
        if w in ft.wv:
            weight = a / (a + p.get(w, 1e-9))
            vs.append(weight * ft.wv[w])
    qv = np.mean(vs, axis=0) if vs else np.zeros(ft.vector_size)
    qv = qv - (qv @ pc) * pc
    qv = normalize(qv.reshape(1, -1))
    sims = (W_sif @ qv.T).ravel()
    order = sims.argsort()[::-1][:topk]
    return [(i, float(sims[i]), docs[i]) for i in order]
evaluate(lambda q,k=5: search_sif(q,k), 'SIF‑fastText', 5)
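To see the methods side by side (matplotlib was imported above but not yet used), a small sketch that recomputes mean P@5 per method and plots it:
# Bar chart of mean P@5 across all methods, against the same GT.
methods = {
    'TF-IDF':       lambda q, k=5: search_tfidf(q, k),
    'fastText-avg': lambda q, k=5: search_fasttext(q, k),
    'Doc2Vec-DBOW': lambda q, k=5: search_doc2vec(d2v_dbow, q, k),
    'Doc2Vec-DM':   lambda q, k=5: search_doc2vec(d2v_dm, q, k),
    'SIF-fastText': lambda q, k=5: search_sif(q, k),
}
means = {name: np.mean([precision_at_k([i for i, _, _ in fn(q, 5)], set(GT[q]), 5)
                        for q in QUERIES])
         for name, fn in methods.items()}
plt.figure(figsize=(7, 3))
plt.bar(list(means.keys()), list(means.values()))
plt.ylabel('mean P@5')
plt.xticks(rotation=20)
plt.tight_layout()
plt.show()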
G. Save Artifacts
import joblib
ft.save('fasttext_sessi12.model')
d2v_dbow.save('doc2vec_dbow_sessi12.model')
d2v_dm.save('doc2vec_dm_sessi12.model')
joblib.dump({'QUERIES':QUERIES,'GT':GT}, 'semantic_search_gt_sessi12.joblib')
print('Saved: fasttext_sessi12.model, doc2vec_*_sessi12.model, semantic_search_gt_sessi12.joblib')
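To verify that the artifacts round-trip, a quick reload sketch (gensim models load via their class's .load; the query/GT bundle via joblib):
# Reload everything and confirm the pieces still line up.
from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec
ft2 = FastText.load('fasttext_sessi12.model')
d2v2 = Doc2Vec.load('doc2vec_dbow_sessi12.model')
bundle = joblib.load('semantic_search_gt_sessi12.joblib')
print(len(bundle['QUERIES']), 'queries,', len(d2v2.dv), 'doc vectors')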
3) Case Studies & Analysis
| Case | Approach | Notes |
|---|---|---|
| Internal search over course documents | fastText-avg / SIF | Robust to typos and mixed vocabulary |
| FAQ lookup | Doc2Vec-DBOW | Direct document representation; fast inference |
| Complaint retrieval | TF–IDF vs SIF | Test both; choose by P@k |
4) Mini Assignment (Graded)
- Train fastText (200D) & Doc2Vec (PV-DBOW) on your corpus; compare P@5 on 5–10 queries against the TF–IDF baseline.
- Implement SIF and report its improvement (if any) over fastText-avg.
- Analyze 5 erroneous results (false positives/negatives) and propose fixes (domain stopwords, n-grams, query augmentation).