# pip install faiss-cpu sentence-transformers
from sentence_transformers import SentenceTransformer
import faiss
# Semantic nearest-neighbour search demo: embed a corpus with a sentence
# transformer, index the embeddings with faiss, then query the index.

# replace with own texts - this is a bad example since it contains only single words
# Iterate the file object directly (no intermediate readlines() list) and pin the
# encoding so the result does not depend on the locale.
with open("/usr/share/dict/words", mode="r", encoding="utf-8") as infile:
    # Row number -> text; the row number is also the id faiss assigns on add().
    corpus = {num: line.strip() for num, line in enumerate(infile)}

# encode the corpus using a good sentence transformer model - will be slow if no GPU
model = SentenceTransformer("all-mpnet-base-v2")
corpus_vectors = model.encode(sentences=list(corpus.values()))

# construct a faiss kNN index
# "L2norm,Flat": L2-normalise every vector (corpus and query), then do an exact
# brute-force search.
num_vectors, num_dimensions = corpus_vectors.shape
index = faiss.index_factory(num_dimensions, "L2norm,Flat")
index.add(corpus_vectors)

# optional: save index for reuse
faiss.write_index(index, "/tmp/corpus_index.bin")
# index = faiss.read_index("/tmp/corpus_index.bin")

# encode target text and find 10 nearest neighbors in index
target_vector = model.encode(sentences=["apples"])
distances, nearest_indexes = index.search(target_vector, 10)

# faiss pads the result with label -1 when the index holds fewer vectors than
# requested; skip those slots instead of failing the corpus lookup.
print([
    (corpus[i], distance)
    for i, distance in zip(nearest_indexes[0], distances[0])
    if i != -1
])
# [('apples', 4.382169e-13), ('fruits', 0.47413948), ('fruit', 0.57227534), ...