import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import pandas as pd
from sklearn.decomposition import PCA
import logging
import numpy as np
from sklearn.preprocessing import StandardScaler
import random
# Fetch the NLTK resources at import time. Within this module, 'stopwords'
# backs stopwords.words(...) and 'wordnet' backs wordnet.synsets(...) and the
# WordNetLemmatizer; 'punkt' is presumably the tokenizer model needed by
# nltk.word_tokenize (verify against the installed NLTK version).
# NOTE(review): these calls run on every import of the module.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Module-wide logger, registered under the fixed name 'Preprocessor'.
LOGGER = logging.getLogger('Preprocessor')
def preprocess(data: pd.Series, expansion: bool = False):
    """ Preprocess text into arrays of stemmed tokens

    Every string is tokenized (and lower-cased), cleaned of punctuation and
    stopwords, optionally expanded with synonyms and finally stemmed.

    Args:
        data (pd.Series): Series of strings
        expansion (bool): Decide whether to use word expansion on data or not

    Returns:
        data (pd.Series): Series of np.arrays containing preprocessed text
    """
    LOGGER.info('Preprocessing ...')

    def _pipeline(text):
        # Same stage order for both modes; expansion is the only optional step
        # and sits between removal and stemming.
        tokens = removal(tokenization(text))
        if expansion:
            tokens = query_expansion(tokens)
        return np.array(stemming(tokens))

    # ``progress_apply`` is only present on pandas objects after the caller
    # has registered tqdm via ``tqdm.pandas()``. Fall back to the plain
    # ``apply`` so the function does not fail with AttributeError otherwise.
    apply = getattr(data, 'progress_apply', data.apply)
    return apply(_pipeline)
def tokenization(text: str):
    """ Lower-case a string and split it into tokens with nltk.word_tokenize

    Args:
        text (str): String of text

    Returns:
        (pd.Series): Series holding the tokens of the lower-cased text
    """
    lowered = text.lower()
    words = nltk.word_tokenize(lowered)
    return pd.Series(words)
def removal(tokens: pd.Series):
    """ Remove punctuation, stopwords and NA values

    Punctuation characters are stripped from every token first; tokens that
    are empty afterwards or that are English stopwords are then dropped.
    The index labels of the surviving tokens are left as they were.

    Args:
        tokens (pd.Series): Series of tokens

    Returns:
        tokens (pd.Series): Series containing tokens with punctuation, stopwords and NA values removed
    """
    # Build both lookup structures once per call instead of once per token:
    # a set gives constant-time membership tests, and the translation table
    # does not depend on the token being processed.
    stopword_set = set(stopwords.words("english"))
    punctuation_table = str.maketrans('', '', string.punctuation)

    stripped = tokens.apply(lambda token: token.translate(punctuation_table))
    # Mark unwanted tokens as None so that dropna() removes them.
    kept = stripped.apply(
        lambda token: token if token != '' and token not in stopword_set else None
    )
    return kept.dropna()
def stemming(tokens: pd.Series):
    """ Reduce every token to its stem with nltk's PorterStemmer

    Args:
        tokens (pd.Series): Series of tokens

    Returns:
        tokens (pd.Series): Series containing stemmed tokens
    """
    porter = PorterStemmer()
    # The bound method is applied element-wise, one call per token.
    stem_token = porter.stem
    return tokens.apply(stem_token)
def lemmatization(tokens: pd.Series):
    """ Map every token to its lemma with nltk's WordNetLemmatizer

    Args:
        tokens (pd.Series): Series of tokens

    Returns:
        tokens (pd.Series): Series containing lemmatized tokens
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    # The bound method is applied element-wise, one call per token.
    lemmatize_token = wordnet_lemmatizer.lemmatize
    return tokens.apply(lemmatize_token)
def pca(features: pd.DataFrame, components: int = 5):
    """ Project features onto their principal components

    Args:
        features (pd.DataFrame): Feature matrix handed to PCA.fit_transform
        components (int): Value passed to sklearn's PCA as n_components

    Returns:
        (pd.DataFrame): Transformed data with the index of ``features`` and
            columns named 'pca_comp_0', 'pca_comp_1', ...
    """
    # Named ``reducer`` so the local does not shadow this function's own name.
    reducer = PCA(components)
    transformed = reducer.fit_transform(features)
    # Name the columns after what PCA actually produced instead of
    # ``range(components)``: identical for an integer ``components``, and it
    # keeps working when n_components is given as a fraction or 'mle'.
    columns = ['pca_comp_%i' % i for i in range(transformed.shape[1])]
    return pd.DataFrame(transformed, columns=columns, index=features.index)
def split_and_scale(X_y_train, X_test, X_val=None, components_pca: int = 0):
    """ Scale train, test and optional validation features jointly and split them again

    The label column 'y' and the identifier columns 'qID' / 'pID' are taken
    out of the feature frames. The remaining features of all partitions are
    stacked, standardized with one StandardScaler fitted on the stacked data,
    optionally reduced with pca(), and cut back into the partitions by row
    position.

    Args:
        X_y_train (): Training frame with columns 'qID', 'pID', 'y' plus features
        X_test (): Test frame with columns 'pID', 'qID' plus features
        X_val: (): Optional validation frame with columns 'pID', 'qID' plus features
        components_pca (): Number of PCA components; PCA is skipped unless > 0

    Returns:
        X (): Scaled (and possibly PCA-reduced) training features
        y (): The 'y' column of X_y_train
        X_test (): Scaled (and possibly PCA-reduced) test features
        test_pair (): The 'pID' / 'qID' columns of the test frame
        X_val (): Scaled validation features, only returned if X_val was given
        val_pair (): The 'pID' / 'qID' columns of X_val, only returned if X_val was given
    """
    has_val = X_val is not None

    y = X_y_train['y']
    train_features = X_y_train.drop(columns=['qID', 'pID', 'y'])

    test_pair = X_test[['pID', 'qID']]
    test_features = X_test.drop(columns=['pID', 'qID'])

    parts = [train_features, test_features]
    if has_val:
        val_pair = X_val[['pID', 'qID']]
        parts.append(X_val.drop(columns=['pID', 'qID']))

    # One scaler is fitted on the stacked rows of every partition; the new
    # frame gets a fresh 0..n-1 index, so the partitions are contiguous row
    # ranges in the order train, test, validation.
    scaled = StandardScaler().fit_transform(pd.concat(parts))
    data = pd.DataFrame(scaled, columns=train_features.columns)
    if components_pca > 0:
        data = pca(data, components_pca)

    n_train = len(train_features)
    test_end = n_train + len(test_features)
    X = data.iloc[:n_train]
    X_test = data.iloc[n_train:test_end]
    if not has_val:
        return X, y, X_test, test_pair
    X_val = data.iloc[test_end:]
    return X, y, X_test, test_pair, X_val, val_pair
def query_expansion(tokens: pd.Series, sample_size=2):
    """ Expand series of tokens with synonyms

    Each token is kept and immediately followed by the synonyms that
    get_synonyms() returns for it.

    Args:
        tokens (pd.Series): Series of tokens
        sample_size (int): Passed on to get_synonyms() for every token

    Returns:
        new_tokenlist (pd.Series): Tokens interleaved with their synonyms
    """
    expanded = []
    for word in tokens.tolist():
        # An empty synonym list simply contributes nothing.
        expanded += [word, *get_synonyms(word, sample_size)]
    return pd.Series(expanded)
def get_synonyms(phrase, sample_size):
    """ Create synonyms using wordnet's synsets method

    Collects the lemma names of all synsets of ``phrase``, ignoring names
    that contain an underscore and the phrase itself.

    Args:
        phrase (str): Word to look up in WordNet
        sample_size (int): Number of synonyms to sample

    Returns:
        (list): All distinct synonyms in sorted order if sample_size is larger
            than the number of synonyms found, otherwise a random sample of
            sample_size synonyms
    """
    synonym_set = {
        lemma.name()
        for synset in wordnet.synsets(phrase)
        for lemma in synset.lemmas()
        if '_' not in lemma.name() and lemma.name() != phrase
    }
    # random.sample() no longer accepts a set (deprecated in Python 3.9,
    # TypeError from 3.11 on), so sample from a list. Sorting additionally
    # makes the result reproducible for a seeded random generator.
    candidates = sorted(synonym_set)
    if sample_size > len(candidates):
        return candidates
    return random.sample(candidates, sample_size)