Source code for src.features.features

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from src.data.preprocessing import tokenization
import nltk
from textblob import TextBlob
import numpy as np


[docs]def cosine_similarity_score(embedding_1: list, embedding_2: list): """ Calculates cosine similarity between two word embeddings. Args: embedding_1 (list): List contaning word embedding embedding_2 (list): List contaning word embedding Returns: cosine_similarity (float): Cosine similarity between two word embeddings """ return cosine_similarity(embedding_1, embedding_2)[0][0]
[docs]def euclidean_distance_score(embedding_1: list, embedding_2: list): """ Calculates euclidean distance between two word embeddings. Args: embedding_1 (list): List contaning word embedding embedding_2 (list): List contaning word embedding Returns: euclidean_distances (float): Euclidean distance between two word embeddings """ return euclidean_distances(embedding_1, embedding_2)[0][0]
[docs]def manhattan_distance_score(embedding_1: list, embedding_2: list): """ Calculates manhatten distance between two word embeddings. Args: embedding_1 (list): List contaning word embedding embedding_2 (list): List contaning word embedding Returns: manhattan_distances (float): Manhatten distance between two word embeddings """ return manhattan_distances(embedding_1, embedding_2)[0][0]
[docs]def jaccard(token_vector_1: list, token_vector_2: list): """ Calculates jaccard coefficient between two lists of tokens Args: token_vector_1 (list): List contaning tokens token_vector_2 (list): List contaning tokens Returns: (float): Jaccard coefficient as float """ intersect = set(token_vector_1).intersection(set(token_vector_2)) union = set(token_vector_1).union(set(token_vector_2)) try: return len(intersect) / len(union) except ZeroDivisionError: return 0
[docs]def characters(sentence: str): """ Returns length of sentence, not considering whitespaces Args: sentence (str): Sentence as string Returns: (int): Length of sentence as int """ return len(sentence.replace(' ', ''))
[docs]def words(sentence: str): """ Returns number of words in a sentence Args: sentence (str): Sentence as string Returns: (int): Number of words in a sentence as int """ tokens = list(tokenization(sentence)) return len(tokens)
[docs]def POS(sentence: str): """ Returns Number of nouns, adjectives and verbs in a sentence Args: sentence (str): Sentence as string Returns: nouns (int): Number of nouns adj (int): Number of adjectives vetbs (int): Number of verbs """ tokens = list(tokenization(sentence)) tags = nltk.pos_tag(tokens) nouns = len([tag[0] for tag in tags if tag[1].startswith('NN')]) adj = len([tag[0] for tag in tags if tag[1].startswith('JJ')]) verbs = len([tag[0] for tag in tags if tag[1].startswith('VB')]) return nouns, adj, verbs
[docs]def subjectivity(sentence: str): """ Returns the subjectivity of a sentence Args: sentence (str): Sentence as string Returns: subjectivity (int): float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective """ return TextBlob(sentence).sentiment.subjectivity
[docs]def polarisation(sentence :str): """ Returns the polarisation of a sentence Args: sentence (str): Sentence as string Returns: polarisation (int): float within the range [-1.0, 1.0] """ return TextBlob(sentence).sentiment.polarity
[docs]def difference(doc_count, query_count): """ Returns the absolut difference between doc_count and query_count Args: doc_count (int): query_count (int): Returns: (int): Absolut difference between doc_count and query_count """ return np.abs(doc_count - query_count)
[docs]def relative_difference(doc_count, query_count): """ Returns the relative difference between doc_count and query_count Args: doc_count (int): query_count (int): Returns: (float): Relative difference between doc_count and query_count """ return doc_count / query_count