Source code for src.features.generator

import pandas as pd
from tqdm import tqdm
import logging
import numpy as np
import nltk
from src.embeddings.bert import Bert
from src.embeddings.tfidf import TFIDF
from src.embeddings.glove import Glove
from src.embeddings.word2vec import word2vec
from src.features.features import cosine_similarity_score, euclidean_distance_score, manhattan_distance_score, jaccard, \
    words, relative_difference, characters, difference, subjectivity, polarisation, POS
from src.utils.utils import load
from src.features.bm25 import BM25

# Runs at import time: asks NLTK to fetch the 'averaged_perceptron_tagger' resource.
# NOTE(review): presumably needed by the POS helper imported from src.features.features - confirm.
nltk.download('averaged_perceptron_tagger')

# Enables the `progress_apply` method that the feature builders in this module call on dataframes.
tqdm.pandas()
# Module-level logger registered under the name 'generator'.
LOGGER = logging.getLogger('generator')


def create_all(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame, tfidf=None, glove=None,
               bert=None, w2v=None):
    """
    Runs the complete pipeline: builds every embedding (tfidf, glove, bert, word2vec, tfidf weighted word2vec)
    for the collection and the queries, then derives every feature (word2vec, tfidf weighted word2vec, tfidf,
    bert, glove, jaccard, sentence, interpretation, BM25, POS).

    Args:
        features (pd.DataFrame): Dataframe containing feature data
        collection (pd.DataFrame): Dataframe containing collection data
        queries (pd.DataFrame): Dataframe containing query data
        tfidf (TFIDF object): Creates new object of class tfidf if None
        glove (Glove object): Creates new object of class Glove if None
        bert (Bert object): Creates new object of class Bert if None
        w2v (word2vec object): Creates new object of class word2vec if None

    Returns:
        features (pd.DataFrame): Dataframe containing feature data
    """
    # Embedding stage: the model returned by the collection pass is reused for the query pass.
    tfidf = create_tfidf_embeddings(collection, tfidf=tfidf, name='collection')[0]
    create_tfidf_embeddings(queries, tfidf=tfidf, name='query')

    glove = create_glove_embeddings(collection, glove=glove, name='collection')[0]
    create_glove_embeddings(queries, glove=glove, name='query')

    bert = create_bert_embeddings(collection, bert=bert, name='collection')[0]
    create_bert_embeddings(queries, bert=bert, name='query')

    w2v = create_w2v_embeddings(collection, w2v=w2v, name='collection')[0]
    create_w2v_embeddings(queries, w2v=w2v, name='query')

    # The weighted variant reads the "tfidf" column written by the tfidf step above.
    create_w2v_embeddings_tf_idf_weighted(collection, w2v=w2v, name="collection")
    create_w2v_embeddings_tf_idf_weighted(queries, w2v=w2v, name="query")

    # Feature stage: every builder takes and returns the features dataframe, so they are chained in order.
    feature_builders = (
        create_w2v_feature,
        create_w2v_tfidf_feature,
        create_tfidf_feature,
        create_bert_feature,
        create_glove_feature,
        create_jaccard_feature,
        create_sentence_features,
        create_interpretation_features,
        create_BM2_feature,
        create_POS_features,
    )
    for build in feature_builders:
        features = build(features, collection, queries)
    return features
def create_tfidf_embeddings(data: pd.DataFrame, tfidf=None, name: str = ''):
    """
    Embeds the "preprocessed" column of a dataframe with tfidf.

    Args:
        data (pd.DataFrame): Dataframe containing data to be embedded (column "preprocessed" is read)
        tfidf (TFIDF object): Model to use. If None, a new TFIDF object is created and fitted on
            data["preprocessed"]
        name (str): Inserted into the .pkl file name that is handed to the model's transform call

    Returns:
        tfidf (TFIDF object): Object of class TFIDF that produced the embeddings
        data (pd.DataFrame): Dataframe data with new column "tfidf" appended
    """
    if tfidf is None:
        # Only a freshly created model is fitted; a model passed in by the caller is used as is.
        tfidf = TFIDF()
        tfidf.fit(data['preprocessed'])

    texts = data['preprocessed']
    target_path = f"data/embeddings/tfidf_{name}_embeddings.pkl"
    data['tfidf'] = tfidf.transform(texts, target_path)
    return tfidf, data
def create_glove_embeddings(data: pd.DataFrame, glove=None, name: str = ''):
    """
    Embeds the "preprocessed" column of a dataframe with glove.

    Args:
        data (pd.DataFrame): Dataframe containing data to be embedded (column "preprocessed" is read)
        glove (Glove object): Model to use. Creates new object of class Glove if None
        name (str): Inserted into the .pkl file name that is handed to the model's transform call

    Returns:
        glove (Glove object): Object of class Glove that produced the embeddings
        data (pd.DataFrame): Dataframe data with new column "glove" appended
    """
    glove = Glove() if glove is None else glove
    target_path = f"data/embeddings/glove_{name}_embeddings.pkl"
    data['glove'] = glove.transform(data['preprocessed'], target_path)
    return glove, data
def create_glove_embeddings_tf_idf_weighted(data: pd.DataFrame, glove=None, name: str = ''):
    """
    Embeds the "preprocessed" column of a dataframe with tfidf weighted glove.

    Args:
        data (pd.DataFrame): Dataframe containing data to be embedded (columns "preprocessed" and "tfidf"
            are read)
        glove (Glove object): Model to use. Creates new object of class Glove if None
        name (str): Inserted into the .pkl file name that is handed to the model's transform call

    Returns:
        glove (Glove object): Object of class Glove that produced the embeddings
        data (pd.DataFrame): Dataframe data with new column "glove_tfidf" appended
    """
    glove = Glove() if glove is None else glove
    target_path = f"data/embeddings/glove_tf_idf_{name}_embeddings.pkl"
    data['glove_tfidf'] = glove.transform_tfidfweighted(data['preprocessed'], data['tfidf'], target_path)
    return glove, data
def create_bert_embeddings(data: pd.DataFrame, bert=None, name: str = ''):
    """
    Embeds the raw text column of a dataframe with bert.

    Unlike the other embedding helpers this one does not read "preprocessed": it embeds the "Passage" column
    when name is "collection" and the "Query" column when name is "query" or "query_test".

    Args:
        data (pd.DataFrame): Dataframe containing data to be embedded
        bert (Bert object): Model to use. Creates new object of class Bert if None
        name (str): Selects the text column (see above) and is inserted into the .pkl file name that is
            handed to the model's transform call

    Returns:
        bert (Bert object): Object of class Bert that produced the embeddings
        data (pd.DataFrame): Dataframe data with new column "bert" appended

    Raises:
        KeyError: If name is not one of "collection", "query" or "query_test" (this includes the default ''),
            or if the selected column is missing from data
    """
    source_columns = {"collection": "Passage", "query": "Query", "query_test": "Query"}

    # Validate before a Bert model is instantiated so an unsupported name fails fast with a readable
    # message instead of an empty-column lookup. KeyError is used because that is what the failed
    # column lookup raises.
    if name not in source_columns:
        raise KeyError(
            f"Unsupported name {name!r} for bert embeddings, expected one of {sorted(source_columns)}")

    if bert is None:
        bert = Bert()

    target_path = f"data/embeddings/bert_{name}_embeddings.pkl"
    data['bert'] = bert.transform(data[source_columns[name]], target_path)
    return bert, data
def create_w2v_embeddings(data: pd.DataFrame, w2v=None, name: str = ''):
    """
    Embeds the "preprocessed" column of a dataframe with word2vec.

    Args:
        data (pd.DataFrame): Dataframe containing data to be embedded (column "preprocessed" is read)
        w2v (word2vec object): Model to use. Creates new object of class word2vec if None
        name (str): Inserted into the .pkl file name that is handed to the model's transform call

    Returns:
        w2v (word2vec object): Object of class word2vec that produced the embeddings
        data (pd.DataFrame): Dataframe data with new column "w2v" appended
    """
    w2v = word2vec() if w2v is None else w2v
    target_path = f"data/embeddings/w2v_{name}_embeddings.pkl"
    data['w2v'] = w2v.transform(data['preprocessed'], target_path)
    return w2v, data
def create_w2v_embeddings_tf_idf_weighted(data: pd.DataFrame, w2v=None, name: str = ''):
    """
    Embeds the "preprocessed" column of a dataframe with tfidf weighted word2vec.

    Args:
        data (pd.DataFrame): Dataframe containing data to be embedded (columns "preprocessed" and "tfidf"
            are read)
        w2v (word2vec object): Model to use. Creates new object of class word2vec if None
        name (str): Inserted into the .pkl file name that is handed to the model's transform call

    Returns:
        w2v (word2vec object): Object of class word2vec that produced the embeddings
        data (pd.DataFrame): Dataframe data with new column "w2v_tfidf" appended
    """
    w2v = word2vec() if w2v is None else w2v
    target_path = f"data/embeddings/w2v_tfidf_{name}_embeddings.pkl"
    data['w2v_tfidf'] = w2v.transform_tf_idf_weighted(data['preprocessed'], data['tfidf'], target_path)
    return w2v, data
def create_w2v_feature(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame,
                       path_collection: str = 'data/embeddings/w2v_collection_embeddings.pkl',
                       path_query: str = 'data/embeddings/w2v_query_embeddings.pkl'):
    """
    Scores every query-passage pair in "features" on the stored word2vec embeddings
    (cosine, euclidean, manhattan).

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (column "pID" is read)
        queries (pd.DataFrame): Dataframe containing queries data (column "qID" is read)
        path_collection (str): Path to "w2v_collection_embeddings.pkl" file
        path_query (str): Path to "w2v_query_embeddings.pkl" file

    Returns:
        features (pd.DataFrame): Dataframe "features" with new columns "w2v_cosine", "w2v_euclidean",
            "w2v_manhattan" appended
    """
    passage_vectors = np.array(load(path_collection))
    query_vectors = np.array(load(path_query))

    def _scorer(metric):
        """Builds the row function that applies metric to the embeddings of one query-passage pair."""
        def _score(qrel):
            # NOTE(review): the dataframe index labels of the matching rows are used as positions in the
            # embedding arrays - presumably this relies on a default 0..n-1 index; confirm with callers.
            query_rows = queries[queries['qID'] == qrel.qID].index
            passage_rows = collection[collection['pID'] == qrel.pID].index
            return metric(query_vectors[query_rows], passage_vectors[passage_rows])
        return _score

    features['w2v_cosine'] = features.progress_apply(_scorer(cosine_similarity_score), axis=1)
    features['w2v_euclidean'] = features.progress_apply(_scorer(euclidean_distance_score), axis=1)
    features['w2v_manhattan'] = features.progress_apply(_scorer(manhattan_distance_score), axis=1)
    return features
def create_w2v_tfidf_feature(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame,
                             path_collection: str = 'data/embeddings/w2v_tfidf_collection_embeddings.pkl',
                             path_query: str = 'data/embeddings/w2v_tfidf_query_embeddings.pkl'):
    """
    Scores every query-passage pair in "features" on the stored tfidf weighted word2vec embeddings
    (cosine, euclidean, manhattan).

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (column "pID" is read)
        queries (pd.DataFrame): Dataframe containing queries data (column "qID" is read)
        path_collection (str): Path to "w2v_tfidf_collection_embeddings.pkl" file
        path_query (str): Path to "w2v_tfidf_query_embeddings.pkl" file

    Returns:
        features (pd.DataFrame): Dataframe "features" with new columns "w2v_tfidf_cosine",
            "w2v_tfidf_euclidean", "w2v_tfidf_manhattan" appended
    """
    passage_vectors = np.array(load(path_collection))
    query_vectors = np.array(load(path_query))

    def _scorer(metric):
        """Builds the row function that applies metric to the embeddings of one query-passage pair."""
        def _score(qrel):
            # NOTE(review): the dataframe index labels of the matching rows are used as positions in the
            # embedding arrays - presumably this relies on a default 0..n-1 index; confirm with callers.
            query_rows = queries[queries['qID'] == qrel.qID].index
            passage_rows = collection[collection['pID'] == qrel.pID].index
            return metric(query_vectors[query_rows], passage_vectors[passage_rows])
        return _score

    features['w2v_tfidf_cosine'] = features.progress_apply(_scorer(cosine_similarity_score), axis=1)
    features['w2v_tfidf_euclidean'] = features.progress_apply(_scorer(euclidean_distance_score), axis=1)
    features['w2v_tfidf_manhattan'] = features.progress_apply(_scorer(manhattan_distance_score), axis=1)
    return features
def create_tfidf_feature(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame,
                         path_collection: str = 'data/embeddings/tfidf_collection_embeddings.pkl',
                         path_query: str = 'data/embeddings/tfidf_query_embeddings.pkl'):
    """
    Scores every query-passage pair in "features" on the stored tfidf embeddings
    (cosine, euclidean, manhattan).

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (column "pID" is read)
        queries (pd.DataFrame): Dataframe containing queries data (column "qID" is read)
        path_collection (str): Path to "tfidf_collection_embeddings.pkl" file
        path_query (str): Path to "tfidf_query_embeddings.pkl" file

    Returns:
        features (pd.DataFrame): Dataframe "features" with new columns "tfidf_cosine", "tfidf_euclidean",
            "tfidf_manhattan" appended
    """
    # The loaded objects are indexed as they come from disk; they are not converted with np.array here.
    passage_vectors = load(path_collection)
    query_vectors = load(path_query)

    def _scorer(metric):
        """Builds the row function that applies metric to the embeddings of one query-passage pair."""
        def _score(qrel):
            # NOTE(review): the dataframe index labels of the matching rows are used as positions in the
            # loaded embeddings - presumably this relies on a default 0..n-1 index; confirm with callers.
            query_rows = queries[queries['qID'] == qrel.qID].index
            passage_rows = collection[collection['pID'] == qrel.pID].index
            return metric(query_vectors[query_rows], passage_vectors[passage_rows])
        return _score

    features['tfidf_cosine'] = features.progress_apply(_scorer(cosine_similarity_score), axis=1)
    features['tfidf_euclidean'] = features.progress_apply(_scorer(euclidean_distance_score), axis=1)
    features['tfidf_manhattan'] = features.progress_apply(_scorer(manhattan_distance_score), axis=1)
    return features
def create_glove_feature(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame,
                         path_collection: str = 'data/embeddings/glove_collection_embeddings.pkl',
                         path_query: str = 'data/embeddings/glove_query_embeddings.pkl'):
    """
    Scores every query-passage pair in "features" on the stored glove embeddings
    (cosine, euclidean, manhattan).

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (column "pID" is read)
        queries (pd.DataFrame): Dataframe containing queries data (column "qID" is read)
        path_collection (str): Path to "glove_collection_embeddings.pkl" file
        path_query (str): Path to "glove_query_embeddings.pkl" file

    Returns:
        features (pd.DataFrame): Dataframe "features" with new columns "glove_cosine", "glove_euclidean",
            "glove_manhattan" appended
    """
    passage_vectors = np.array(load(path_collection))
    query_vectors = np.array(load(path_query))

    def _scorer(metric):
        """Builds the row function that applies metric to the embeddings of one query-passage pair."""
        def _score(qrel):
            # NOTE(review): the dataframe index labels of the matching rows are used as positions in the
            # embedding arrays - presumably this relies on a default 0..n-1 index; confirm with callers.
            query_rows = queries[queries['qID'] == qrel.qID].index
            passage_rows = collection[collection['pID'] == qrel.pID].index
            return metric(query_vectors[query_rows], passage_vectors[passage_rows])
        return _score

    features['glove_cosine'] = features.progress_apply(_scorer(cosine_similarity_score), axis=1)
    features['glove_euclidean'] = features.progress_apply(_scorer(euclidean_distance_score), axis=1)
    features['glove_manhattan'] = features.progress_apply(_scorer(manhattan_distance_score), axis=1)
    return features
def create_bert_feature(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame,
                        path_collection: str = 'data/embeddings/bert_collection_embeddings.pkl',
                        path_query: str = 'data/embeddings/bert_query_embeddings.pkl'):
    """
    Scores every query-passage pair in "features" on the stored bert embeddings
    (cosine, euclidean, manhattan).

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (column "pID" is read)
        queries (pd.DataFrame): Dataframe containing queries data (column "qID" is read)
        path_collection (str): Path to "bert_collection_embeddings.pkl" file
        path_query (str): Path to "bert_query_embeddings.pkl" file

    Returns:
        features (pd.DataFrame): Dataframe "features" with new columns "bert_cosine", "bert_euclidean",
            "bert_manhattan" appended
    """
    passage_vectors = np.array(load(path_collection))
    query_vectors = np.array(load(path_query))

    def _scorer(metric):
        """Builds the row function that applies metric to the embeddings of one query-passage pair."""
        def _score(qrel):
            # NOTE(review): the dataframe index labels of the matching rows are used as positions in the
            # embedding arrays - presumably this relies on a default 0..n-1 index; confirm with callers.
            query_rows = queries[queries['qID'] == qrel.qID].index
            passage_rows = collection[collection['pID'] == qrel.pID].index
            return metric(query_vectors[query_rows], passage_vectors[passage_rows])
        return _score

    features['bert_cosine'] = features.progress_apply(_scorer(cosine_similarity_score), axis=1)
    features['bert_euclidean'] = features.progress_apply(_scorer(euclidean_distance_score), axis=1)
    features['bert_manhattan'] = features.progress_apply(_scorer(manhattan_distance_score), axis=1)
    return features
def create_jaccard_feature(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame):
    """
    Computes the jaccard feature for every query-passage pair in "features".

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (columns "pID" and "preprocessed"
            are read)
        queries (pd.DataFrame): Dataframe containing queries data (columns "qID" and "preprocessed" are read)

    Returns:
        features (pd.DataFrame): Dataframe "features" with new column "jaccard" appended
    """
    def _pair_jaccard(qrel):
        # The first row matching the id is used on each side; the passage is passed to jaccard first.
        passage_prep = collection[collection['pID'] == qrel['pID']]['preprocessed'].iloc[0]
        query_prep = queries[queries['qID'] == qrel['qID']]['preprocessed'].iloc[0]
        return jaccard(passage_prep, query_prep)

    features['jaccard'] = features.progress_apply(_pair_jaccard, axis=1)
    return features
def create_sentence_features(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame):
    """
    Computes word and character based sentence features for every query-passage pair in "features".

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (columns "pID" and "Passage" are read)
        queries (pd.DataFrame): Dataframe containing queries data (columns "qID" and "Query" are read)

    Returns:
        features (pd.DataFrame): Dataframe "features" with new columns "words_doc", "words_query",
            "words_difference", "words_rel_difference", "char_doc", "char_query", "char_difference",
            "char_rel_difference" appended
    """
    def _passage_text(qrel):
        """Returns the "Passage" value of the first collection row matching the pair's pID."""
        return collection[collection['pID'] == qrel['pID']]['Passage'].iloc[0]

    def _query_text(qrel):
        """Returns the "Query" value of the first queries row matching the pair's qID."""
        return queries[queries['qID'] == qrel['qID']]['Query'].iloc[0]

    # One entry per measure: (measure, doc column, query column, difference column, relative difference column).
    column_plan = (
        (words, 'words_doc', 'words_query', 'words_difference', 'words_rel_difference'),
        (characters, 'char_doc', 'char_query', 'char_difference', 'char_rel_difference'),
    )
    for measure, doc_column, query_column, diff_column, rel_diff_column in column_plan:
        features[doc_column] = features.progress_apply(
            lambda qrel: measure(_passage_text(qrel)), axis=1)
        features[query_column] = features.progress_apply(
            lambda qrel: measure(_query_text(qrel)), axis=1)
        # The two difference columns are derived from the doc/query columns written just above.
        features[diff_column] = features.progress_apply(
            lambda qrel: difference(qrel[doc_column], qrel[query_column]), axis=1)
        features[rel_diff_column] = features.progress_apply(
            lambda qrel: relative_difference(qrel[doc_column], qrel[query_column]), axis=1)
    return features
def create_interpretation_features(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame):
    """
    Computes interpretation features (subjectivity, polarity) of the passage and of the query for every
    query-passage pair in "features".

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (columns "pID" and "Passage" are read)
        queries (pd.DataFrame): Dataframe containing queries data (columns "qID" and "Query" are read)

    Returns:
        features (pd.DataFrame): Dataframe "features" with new columns "subjectivity_doc", "polarity_doc",
            "subjectivity_query", "polarity_query" appended
    """
    def _passage_text(qrel):
        """Returns the "Passage" value of the first collection row matching the pair's pID."""
        return collection[collection['pID'] == qrel['pID']]['Passage'].iloc[0]

    def _query_text(qrel):
        """Returns the "Query" value of the first queries row matching the pair's qID."""
        return queries[queries['qID'] == qrel['qID']]['Query'].iloc[0]

    features['subjectivity_doc'] = features.progress_apply(
        lambda qrel: subjectivity(_passage_text(qrel)), axis=1)
    features['polarity_doc'] = features.progress_apply(
        lambda qrel: polarisation(_passage_text(qrel)), axis=1)
    features['subjectivity_query'] = features.progress_apply(
        lambda qrel: subjectivity(_query_text(qrel)), axis=1)
    features['polarity_query'] = features.progress_apply(
        lambda qrel: polarisation(_query_text(qrel)), axis=1)
    return features
def create_POS_features(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame):
    """
    Computes Part of Speech features (nouns, adjectives, verbs) of the passage and of the query for every
    query-passage pair in "features".

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (columns "pID" and "Passage" are read)
        queries (pd.DataFrame): Dataframe containing queries data (columns "qID" and "Query" are read)

    Returns:
        features (pd.DataFrame): Dataframe "features" with new columns "doc_nouns", "doc_adjectives",
            "doc_verbs", "query_nouns", "query_adjectives", "query_verbs" appended
    """
    def _passage_tags(qrel):
        return POS(collection[collection['pID'] == qrel['pID']]['Passage'].iloc[0])

    def _query_tags(qrel):
        return POS(queries[queries['qID'] == qrel['qID']]['Query'].iloc[0])

    # Each POS result is read by position: 0 goes to the nouns column, 1 to adjectives, 2 to verbs.
    passage_pos = features.progress_apply(_passage_tags, axis=1)
    for position, column in enumerate(('doc_nouns', 'doc_adjectives', 'doc_verbs')):
        features[column] = [tag[position] for tag in passage_pos]

    query_pos = features.progress_apply(_query_tags, axis=1)
    for position, column in enumerate(('query_nouns', 'query_adjectives', 'query_verbs')):
        features[column] = [tag[position] for tag in query_pos]
    return features
def create_BM2_feature(features: pd.DataFrame, collection: pd.DataFrame, queries: pd.DataFrame):
    """
    Computes the BM25 feature for every query-passage pair in "features".

    Args:
        features (pd.DataFrame): Dataframe containing feature data (columns "qID" and "pID" are read)
        collection (pd.DataFrame): Dataframe containing collection data (columns "pID" and "preprocessed"
            are read; the BM25 model is fitted on "preprocessed")
        queries (pd.DataFrame): Dataframe containing queries data (columns "qID" and "preprocessed" are read)

    Returns:
        features (pd.DataFrame): Dataframe "features" with new column "bm25" appended
    """
    # predict_proba is called on whatever fit returns, so fit is expected to hand back the fitted model.
    ranker = BM25().fit(collection['preprocessed'])

    def _pair_score(qrel):
        # The first row matching the id is used on each side; the query is passed to predict_proba first.
        query_prep = queries[queries['qID'] == qrel['qID']]['preprocessed'].iloc[0]
        passage_prep = collection[collection['pID'] == qrel['pID']]['preprocessed'].iloc[0]
        return ranker.predict_proba(query_prep, passage_prep)

    features['bm25'] = features.progress_apply(_pair_score, axis=1)
    return features