Session 12 — fastText, Doc2Vec, & Semantic Search

Objective: understand word & document embeddings that are robust to typos and morphological variation (fastText), learn document vectors directly (Doc2Vec), and build vector-based semantic search.

Learning Outcomes: (1) Train fastText & Doc2Vec on a small corpus; (2) Build a vector index for semantic search; (3) Compare against TF–IDF/Word2Vec pooling; (4) Evaluate retrieval@k & perform error analysis.

1) Core Concepts

  • fastText: embedding model with subwords (character n-grams); strong on OOV words and typos (see the sketch below).
  • Doc2Vec: introduces paragraph/document vectors (PV-DM, PV-DBOW) that are learned directly.
  • Semantic Search: retrieval based on vector similarity (cosine) rather than literal keyword matching.
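
To make the subword idea concrete, here is a minimal illustrative sketch of character n-gram overlap (fastText's actual implementation hashes n-grams into buckets, but the '<' and '>' boundary markers are used the same way):

def char_ngrams(word, min_n=3, max_n=5):
    w = f'<{word}>'  # boundary markers, as in fastText
    return {w[i:i+n] for n in range(min_n, max_n + 1) for i in range(len(w) - n + 1)}

# The typo 'pengirimn' still shares most of its n-grams with 'pengiriman'
a, b = char_ngrams('pengiriman'), char_ngrams('pengirimn')
print(len(a & b) / len(a | b))  # Jaccard overlap stays high despite the typo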

2) Google Colab Practice — Training fastText & Doc2Vec

Use the preprocessed corpus (Session 3) with at least 30 documents. We also prepare simple queries & ground truth to evaluate retrieval.

A. Setup & Data

!pip -q install pandas numpy scikit-learn gensim matplotlib tabulate

import numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate
import matplotlib.pyplot as plt

# Load the corpus (fall back to the Session 2 file if the Session 3 variants are missing)
try:
    df = pd.read_csv('corpus_sessi3_variants.csv')
    docs = df['v2_stop_stemID'].dropna().astype(str).tolist()
except (FileNotFoundError, KeyError):
    docs = pd.read_csv('corpus_sessi2_normalized.csv')['text'].dropna().astype(str).tolist()

print('Documents:', len(docs))

# Simple queries & ground truth (adjust to your domain)
QUERIES = {
  'pengiriman cepat': ['pengiriman','cepat'],
  'refund cepat': ['refund','cepat','proses'],
  'screen glare': ['screen','glare','outdoor'],
  'bluetooth stabil': ['bluetooth','stabil'],
  'login delay': ['login','delay']
}

# Heuristic ground truth → list of relevant document indices per query
GT = {}
for q, kws in QUERIES.items():
    rel = []
    for i, d in enumerate(docs):
        tok = set(d.split())
        if any(k in tok for k in kws):
            rel.append(i)
    GT[q] = sorted(set(rel))

print('Sample GT:', {k: v[:3] for k, v in list(GT.items())[:2]})

B. Baseline: TF–IDF Retrieval

vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95, sublinear_tf=True, norm='l2')
X = vec.fit_transform(docs)

def search_tfidf(query, topk=5):
    q = vec.transform([query])
    sims = cosine_similarity(X, q).ravel()
    order = sims.argsort()[::-1][:topk]
    return [(i, float(sims[i]), docs[i]) for i in order]
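
A quick sanity check before moving on (the query here is illustrative; output depends on your corpus):

# Inspect the top hits for one query
for i, score, d in search_tfidf('pengiriman cepat', topk=3):
    print(f'{i:>4}  {score:.3f}  {d[:80]}')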

C. fastText (Gensim)

from gensim.models import FastText

sentences = [d.split() for d in docs]
ft = FastText(
    vector_size=200, window=5, min_count=1, workers=2,
    sg=1, negative=10, min_n=3, max_n=5, seed=42
)
ft.build_vocab(sentences)
ft.train(sentences, total_examples=len(sentences), epochs=20)

# Document vector: average of the word vectors (subwords make this robust to typos)
def docvec_fasttext(tokens):
    vs = [ft.wv[w] for w in tokens if w in ft.wv]
    return np.mean(vs, axis=0) if len(vs) else np.zeros(ft.vector_size)

FT = np.vstack([docvec_fasttext(d.split()) for d in docs])

from sklearn.preprocessing import normalize
FT = normalize(FT)  # L2-normalize so that cosine similarity = dot product

def search_fasttext(query, topk=5):
    vs = [ft.wv[w] for w in query.split() if w in ft.wv]
    qv = np.mean(vs, axis=0) if vs else np.zeros(ft.vector_size)
    qv = normalize(qv.reshape(1, -1))  # a zero vector stays zero under sklearn's normalize
    sims = (FT @ qv.T).ravel()
    order = sims.argsort()[::-1][:topk]
    return [(i, float(sims[i]), docs[i]) for i in order]
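
Because fastText composes vectors from subwords, even a word outside the training vocabulary still gets a usable vector. A quick check, assuming the hypothetical typo 'pengirimn':

print('pengirimn' in ft.wv.key_to_index)        # False: never seen in training
print(ft.wv.most_similar('pengirimn', topn=3))  # still works, via its n-grams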

D. Doc2Vec (PV‑DBOW & PV‑DM)

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [TaggedDocument(d.split(), [i]) for i,d in enumerate(docs)]

d2v_dbow = Doc2Vec(vector_size=200, window=5, min_count=1, workers=2, dm=0, negative=10, seed=42, epochs=30)
d2v_dbow.build_vocab(corpus)
d2v_dbow.train(corpus, total_examples=len(corpus), epochs=d2v_dbow.epochs)

# PV-DM (distributed memory) variant
d2v_dm = Doc2Vec(vector_size=200, window=5, min_count=1, workers=2, dm=1, negative=10, seed=42, epochs=30)
d2v_dm.build_vocab(corpus)
d2v_dm.train(corpus, total_examples=len(corpus), epochs=d2v_dm.epochs)

DV_DBOW = np.vstack([d2v_dbow.dv[i] for i in range(len(docs))])
DV_DM   = np.vstack([d2v_dm.dv[i]   for i in range(len(docs))])

DV_DBOW = normalize(DV_DBOW); DV_DM = normalize(DV_DM)

def search_doc2vec(model, query, topk=5):
    qv = normalize(model.infer_vector(query.split(), epochs=30).reshape(1,-1))
    M = DV_DBOW if model is d2v_dbow else DV_DM
    sims = (M @ qv.T).ravel()
    order = sims.argsort()[::-1][:topk]
    return [(i, float(sims[i]), docs[i]) for i in order]
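
A common Doc2Vec diagnostic (a sketch; exact ranks vary between runs because inference is stochastic): re-infer a training document and check that it ranks near itself.

doc_id = 0
inferred = d2v_dbow.infer_vector(docs[doc_id].split(), epochs=30)
print(d2v_dbow.dv.most_similar([inferred], topn=3))  # doc_id should appear near the top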

E. Retrieval Evaluation & Comparison

def precision_at_k(ranked, relset, k=5):
    return sum(1 for i in ranked[:k] if i in relset)/k

def evaluate(search_fn, name, topk=5):
    rows = []
    for q in QUERIES:
        ranked = [i for i, _, _ in search_fn(q, topk)]
        relset = set(GT[q])
        rows.append((q, precision_at_k(ranked, relset, topk)))
    table = pd.DataFrame(rows, columns=['query', 'P@' + str(topk)])  # named to avoid shadowing the global df
    table.loc['mean'] = ['mean', table.iloc[:, 1].mean()]
    print('\n==', name, '==')
    print(table)

# Run the comparisons
evaluate(lambda q,k=5: search_tfidf(q,k), 'TF–IDF', 5)
evaluate(lambda q,k=5: search_fasttext(q,k), 'fastText‑avg', 5)
evaluate(lambda q,k=5: search_doc2vec(d2v_dbow,q,k), 'Doc2Vec‑DBOW', 5)
evaluate(lambda q,k=5: search_doc2vec(d2v_dm,q,k),   'Doc2Vec‑DM', 5)
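
P@k can understate performance when a query has only a few relevant documents; if you also want recall@k, here is a small optional addition in the same style as precision_at_k above:

def recall_at_k(ranked, relset, k=5):
    # fraction of all relevant documents retrieved in the top-k
    return sum(1 for i in ranked[:k] if i in relset) / len(relset) if relset else 0.0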

F. (Optional) SIF Sentence Embeddings

# Smooth Inverse Frequency (SIF): weight each word by a/(a+p(w)), then remove the first principal component
from collections import Counter

# Estimate word frequencies from the corpus
all_tokens = [w for d in docs for w in d.split()]
cnt = Counter(all_tokens)
N = sum(cnt.values())
p = {w: cnt[w]/N for w in cnt}

a = 1e-3

# Use fastText for the word vectors
W = []
for d in docs:
    toks = d.split()
    vs = []
    for w in toks:
        if w in ft.wv:
            weight = a/(a + p.get(w, 1e-9))
            vs.append(weight * ft.wv[w])
    W.append(np.mean(vs, axis=0) if vs else np.zeros(ft.vector_size))
W = np.vstack(W)

# Remove the first principal component ("common component removal"; the SIF paper
# uses the top singular vector of the uncentered matrix, and PCA is a close approximation)
from sklearn.decomposition import PCA
pc = PCA(n_components=1).fit(W).components_[0]
W_sif = W - (W @ pc[:,None]) * pc[None,:]
W_sif = normalize(W_sif)

def search_sif(query, topk=5):
    toks = query.split(); vs=[]
    for w in toks:
        if w in ft.wv:
            weight = a/(a + p.get(w,1e-9))
            vs.append(weight * ft.wv[w])
    qv = np.mean(vs, axis=0) if vs else np.zeros(ft.vector_size)
    qv = qv - (qv @ pc) * pc
    qv = normalize(qv.reshape(1,-1))
    sims = (W_sif @ qv.T).ravel()
    order = sims.argsort()[::-1][:topk]
    return [(i, float(sims[i]), docs[i]) for i in order]

evaluate(lambda q,k=5: search_sif(q,k), 'SIF‑fastText', 5)

G. Save Artifacts

import joblib
ft.save('fasttext_sessi12.model')
d2v_dbow.save('doc2vec_dbow_sessi12.model')
d2v_dm.save('doc2vec_dm_sessi12.model')
joblib.dump({'QUERIES':QUERIES,'GT':GT}, 'semantic_search_gt_sessi12.joblib')
print('Saved: fasttext_sessi12.model, doc2vec_*_sessi12.model, semantic_search_gt_sessi12.joblib')
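
To reload these artifacts in a later session (assuming the same working directory):

from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec
import joblib

ft = FastText.load('fasttext_sessi12.model')
d2v_dbow = Doc2Vec.load('doc2vec_dbow_sessi12.model')
d2v_dm = Doc2Vec.load('doc2vec_dm_sessi12.model')
gt = joblib.load('semantic_search_gt_sessi12.joblib')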

3) Case Studies & Analysis

Case | Approach | Notes
Internal class-document search | fastText-avg / SIF | Robust to typos & mixed vocabulary
FAQ retrieval | Doc2Vec-DBOW | Direct document representation; fast inference
Complaint retrieval | TF–IDF vs SIF | Try both; choose based on P@k

4) Mini Assignment (Graded)

  1. Train fastText (200D) & Doc2Vec (PV-DBOW) on your corpus; compare P@5 for 5–10 queries against the TF–IDF baseline.
  2. Implement SIF and report its improvement (if any) over fastText-avg.
  3. Analyze 5 incorrect results (false positives/negatives) and propose fixes (domain stopwords, n-grams, query augmentation); a starter sketch follows below.
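
A possible starting point for item 3 (a sketch, not a required implementation; error_report is a hypothetical helper name):

def error_report(search_fn, query, topk=5):
    # show which top-k hits fall outside the ground truth, and which relevant docs were missed
    ranked = [i for i, _, _ in search_fn(query, topk)]
    relset = set(GT[query])
    print('False positives:', [i for i in ranked if i not in relset])
    print('False negatives:', [i for i in GT[query] if i not in ranked][:topk])

error_report(search_fasttext, 'pengiriman cepat')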