Source code for src.features.bm25

from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm


class BM25:
    """
    A class to create BM25 features.

    Methods:
        fit(corpus: pd.Series): Train model
        predict_proba(query, document): Return confidence score
        bm25(word, document, k: float = 1, b: float = 0.75): Compute weight

    Attributes:
        corpus: The Series passed to the last ``fit`` call, ``None`` before.
        corpus_length: Number of passages in ``corpus``.
        l_avg: Mean passage length (in tokens) over ``corpus``.
        occurrences: Mapping token -> total number of times the token
            appears across all passages of ``corpus``.
    """

    # Class-level defaults describe the "not fitted yet" state.
    l_avg = None
    corpus = None
    corpus_length = None
    # Declared only; the dict itself is created per instance (see __init__)
    # so that two models never share or overwrite each other's counts.
    occurrences: dict

    def __init__(self):
        self.occurrences = {}

    def fit(self, corpus: pd.Series):
        """
        Fits the model.

        Stores the corpus, its size, the average passage length and the
        per-token occurrence counts. Calling ``fit`` again replaces the
        previous statistics instead of adding to them.

        Args:
            corpus (pd.Series): One tokenised passage per element. Each
                passage must support ``len()`` and iteration over hashable
                tokens (e.g. a list or 1-D array of strings).

        Returns:
            BM25: ``self``, so calls can be chained.
        """
        self.corpus = corpus
        self.corpus_length = corpus.size
        # len() instead of .size so that plain lists work as well as arrays.
        self.l_avg = corpus.apply(len).mean()

        counts = Counter()
        # Iterate over the values rather than corpus[i]: label-based lookup
        # breaks as soon as the Series index is not exactly 0..n-1.
        for passage in tqdm(corpus):
            counts.update(passage)
        # Rebind (never mutate in place) so earlier fits cannot leak in.
        self.occurrences = dict(counts)
        return self

    def predict_proba(self, query, document):
        """
        Predict with confidence score.

        Sums, over the query tokens seen during ``fit``, the product of a
        corpus-level weight and the token's BM25 weight in ``document``.
        Query tokens that never occurred in the corpus are ignored.

        Args:
            query: Iterable of tokens.
            document: Sequence of tokens to score against the query.

        Returns:
            score (float): The accumulated score; ``0`` when no query token
            is known.

        Raises:
            AssertionError: If the model has not been fitted.
        """
        # Explicit raise instead of ``assert`` so the check survives
        # ``python -O``; the exception type is unchanged for callers.
        if self.corpus is None:
            raise AssertionError('Fit the model first')

        # NOTE(review): the weight below is log(0.5 * total_occurrences / N),
        # which is not the textbook BM25 IDF log((N - n + 0.5) / (n + 0.5))
        # and uses token occurrences rather than document frequency. It is
        # kept as-is so existing feature values do not change - confirm
        # whether this is intended.
        return sum(
            np.log(0.5 * self.occurrences[word] / self.corpus_length)
            * self.bm25(word, document)
            for word in query
            if word in self.occurrences
        )

    def bm25(self, word, document, k: float = 1, b: float = 0.75):
        """
        Compute BM25 weight.

        Implements ``tf * (k + 1) / (tf + k * (1 - b + b * len / l_avg))``
        where ``tf`` is the number of elements of ``document`` equal to
        ``word`` and ``l_avg`` is the average passage length set by ``fit``.

        Args:
            word: Token to look for.
            document: Sequence of tokens (list, array or Series).
            k (float): Term-frequency saturation parameter.
            b (float): Strength of the document-length normalisation.

        Returns:
            weight (float)
        """
        # np.asarray makes the comparison element-wise for plain lists too;
        # ``some_list == word`` would just be False and yield a count of 0.
        term_frequency = np.count_nonzero(np.asarray(document) == word)
        doc_length = len(document)
        return (term_frequency * (k + 1)) / (
            term_frequency + k * doc_length / self.l_avg * b + k * (1 - b)
        )