Source code for src.embeddings.tfidf

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from tqdm import tqdm
import numpy as np
from src.utils.utils import check_path_exists, save, load
import os


[docs]class TFIDF(object): """ A class to create tfidf embeddings. Attributes: path (str): Methods: fit(text_in_tokens: pd.Series, store: str = "models/tfidf.pkl"): Fits the tfidf model to the data transform(text_in_tokens: pd.Series, store: str = None): Transforms series of preprocessed tokens to tfidf embeddings """ vectorizer = None fitted = False def __init__(self, path: str = None): """ Constructs tfidf object. Args: path (str): Path of model """ if path is not None: self.vectorizer = load(path)
[docs] def fit(self, text_in_tokens: pd.Series, store: str = "models/tfidf.pkl"): """ Fits the tfidf model to the data. Args: text_in_tokens (pd.Series): Series of preprocessed tokens store (str): Path to store model to Returns: none """ def dummy(text): return text self.vectorizer = TfidfVectorizer(tokenizer=lambda text: text, lowercase=False) self.vectorizer.fit(text_in_tokens) if store is not None: check_path_exists(os.path.dirname(store)) save(self.vectorizer, store) self.fitted = True return self
[docs] def transform(self, text_in_tokens: pd.Series, store: str = None): """ Transform series of preprocessed tokens to tfidf embeddings. Args: text_in_tokens (pd.Series): Series of preprocessed tokens store (str): Path to tfidf embeddings to Returns: tf_idf_vec (np.array): Array containing tfidf embeddings """ assert self.vectorizer is not None, 'You need to fit me first' tfidf_matrix = self.vectorizer.transform(text_in_tokens) token_names = self.vectorizer.get_feature_names_out() tf_idf_dict = {} j = 0 for name in token_names: tf_idf_dict[name] = j j += 1 tf_idf_list = [] for i in tqdm(range(len(text_in_tokens))): tf_idf_token = {} for token in text_in_tokens.iloc[i]: if token in tf_idf_dict.keys(): tf_idf_token[token] = tfidf_matrix[i, tf_idf_dict[token]] else: tf_idf_token[token] = .0 tf_idf_list.append(tf_idf_token) tf_idf_vec = np.array(tf_idf_list) if store is not None: check_path_exists(os.path.dirname(store)) save(tfidf_matrix, store) return tf_idf_vec