Source code for src.embeddings.glove
from flair.embeddings import WordEmbeddings
from flair.data import Sentence
import pandas as pd
from src.utils.utils import check_path_exists, save
import os
from tqdm import tqdm
import torch
from nltk.tokenize.treebank import TreebankWordDetokenizer
[docs]class Glove(object):
""" A class to create glove embeddings.
Methods:
transform(text_in_tokens: pd.Series, store: str = None)
Transform series of preprocessed tokens to glove embeddings
"""
glove = None
def __init__(self):
""" Constructs glove object using a pretrained model. """
self.glove = WordEmbeddings('glove')
[docs] def transform(self, text_in_tokens: pd.Series, store: str = None):
""" Transform series of preprocessed tokens to glove embeddings.
Args:
text_in_tokens (pd.Series): Series of preprocessed tokens
Returns:
glove_vec (list): List containing glove embeddings
"""
glove_vec = []
for line in tqdm(text_in_tokens):
detokenized = TreebankWordDetokenizer().detokenize(line)
sentence = Sentence(detokenized)
self.glove.embed(sentence)
input = torch.empty(sentence[0].embedding.size())
token_per_sentence = torch.zeros_like(input)
for token in sentence:
token_per_sentence = torch.add(token_per_sentence, token.embedding)
glove_vec.append(token_per_sentence.numpy())
if store is not None:
check_path_exists(os.path.dirname(store))
save(glove_vec, store)
return glove_vec