Source code for src.models.pairwise

import torch.nn as nn
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from itertools import combinations


def create_dataloader(X, y, batch_size: int = 50) -> DataLoader:
    """
    Build a shuffled DataLoader of document pairs for pairwise training.

    Rows of ``X`` labelled 1 and rows labelled 0 are stacked in opposite
    orders, so row ``i`` of the first tensor is paired with row ``i`` of the
    second tensor. The target of a pair is 1.0 when its first row is a
    relevant row and 0.0 otherwise.

    Note:
        When the two classes differ in size, some pairs in the middle of the
        stack are made of two rows of the same class.

    Args:
        X (pd.DataFrame): Feature rows, matched with ``y`` by position.
        y (pd.Series): Labels; 1 marks a relevant row and 0 an irrelevant
            one. Rows with any other label are left out.
        batch_size (int): Number of pairs per batch.

    Returns:
        DataLoader: Yields ``(first, second, target)`` float tensors.
    """
    # Select by position so the index of X or y cannot change the selection.
    labels = y.values.reshape(-1)
    X_relevant = X[labels == 1]
    X_irrelevant = X[labels == 0]

    first = torch.tensor(pd.concat([X_relevant, X_irrelevant]).values).float()
    second = torch.tensor(pd.concat([X_irrelevant, X_relevant]).values).float()
    # Targets follow the stacking order of `first`, not the row order of y,
    # so every pair carries the label of its own first document.
    targets = torch.cat([torch.ones(len(X_relevant)),
                         torch.zeros(len(X_irrelevant))])

    dataset = TensorDataset(first, second, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
def train_pairwise(network, X, y, num_epochs: int = 10):
    """
    Train a pairwise network with Adam and binary cross-entropy.

    Args:
        network: Module called as ``network(first, second)``. Its output is
            compared by ``nn.BCELoss`` against the targets reshaped to
            ``(-1, 1)``.
        X: Feature rows, forwarded to ``create_dataloader``.
        y: Labels, forwarded to ``create_dataloader``.
        num_epochs (int): Number of passes over the training pairs.

    Returns:
        list: Loss value of every processed batch, in processing order.
    """
    train_loader = create_dataloader(X, y)
    optimizer = torch.optim.Adam(network.parameters())
    criterion = nn.BCELoss()

    losses = []
    batch_processed_counter = 0
    for epoch in range(num_epochs):
        # Distinct loop names keep the `y` argument from being shadowed.
        for first_docs, second_docs, targets in train_loader:
            batch_processed_counter += 1
            outputs = network(first_docs, second_docs)
            loss = criterion(outputs, targets.reshape(-1, 1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        # Report every tenth epoch; skipped when no batch has run yet so the
        # log line never reads an undefined loss.
        if (epoch + 1) % 10 == 0 and losses:
            print(
                "Epoch: {:2d}/{:2d} ".format(epoch + 1, num_epochs),
                "Batch: {:2d} ".format(batch_processed_counter),
                "Batch loss: {:.6f} ".format(losses[-1])
            )
    return losses
def create_test_combinations(top: pd.DataFrame, k: int = 50) -> tuple:
    """
    Creates test combinations.

    Every unordered pair of positions ``(i, j)`` with ``i < j`` among the
    first ``k`` rows of ``top`` yields one output row: row ``i`` goes into
    the first frame and row ``j`` into the second, at the same position.

    Args:
        top (pd.DataFrame): Candidate rows, addressed by position.
        k (int): Number of leading rows to combine. Values larger than
            ``len(top)`` are clamped to the rows that exist.

    Returns:
        X_relevant_test (pd.DataFrame): First member of each pair.
        X_irrelevant_test (pd.DataFrame): Second member of each pair.
    """
    # Clamp so a short candidate list does not index past the end of `top`.
    n_docs = min(k, len(top))
    pairs = list(combinations(range(n_docs), 2))
    first_positions = [first for first, _ in pairs]
    second_positions = [second for _, second in pairs]

    # One positional take per side instead of one concat per pair.
    X_relevant_test = top.iloc[first_positions].reset_index(drop=True)
    X_irrelevant_test = top.iloc[second_positions].reset_index(drop=True)
    return X_relevant_test, X_irrelevant_test
def bubble_sort(pairwise_results, documents) -> list:
    """
    Order documents in place by bubble sort over pairwise predictions.

    For neighbours ``(a, b)``, a stored prediction for ``(d1=a, d2=b)``
    below 0.5 swaps them. When only ``(d1=b, d2=a)`` is stored, a prediction
    of at least 0.5 swaps them.

    Args:
        pairwise_results (pd.DataFrame): Has the columns ``d1``, ``d2`` and
            ``predictions``. If a pair occurs more than once, its first row
            is the one used.
        documents (list): Document ids to order; modified in place.

    Returns:
        documents (list): The same list object, reordered.

    Raises:
        IndexError: If two neighbouring documents have no prediction in
            either direction.
    """
    if len(documents) < 2:
        return documents

    # Build the lookup once; scanning the frame on every comparison made each
    # pass quadratic in the number of documents.
    scores = {}
    for first, second, score in zip(pairwise_results['d1'],
                                    pairwise_results['d2'],
                                    pairwise_results['predictions']):
        scores.setdefault((first, second), score)

    swapped = True
    while swapped:
        swapped = False
        for i in range(len(documents) - 1):
            left, right = documents[i], documents[i + 1]
            if (left, right) in scores:
                out_of_order = scores[(left, right)] < 0.5
            elif (right, left) in scores:
                out_of_order = scores[(right, left)] >= 0.5
            else:
                raise IndexError(
                    "no pairwise prediction for documents {!r} and {!r}".format(left, right))
            if out_of_order:
                documents[i], documents[i + 1] = right, left
                swapped = True
    return documents
def pairwise_optimize(model, results: pd.DataFrame, X, y, X_test, top_k: int = 50,
                      train: bool = True) -> pd.DataFrame:
    """
    Re-rank the highest-confidence rows of ``results`` with a pairwise model.

    The ``top_k`` rows with the highest ``confidence`` are compared pair by
    pair by ``model``, ordered with ``bubble_sort``, and written back over
    the first ``top_k`` rows of ``results``.

    Args:
        model: Called as ``model(first, second)`` on float tensors. Trained
            first with ``train_pairwise`` when ``train`` is true.
        results (pd.DataFrame): Must contain ``confidence``, ``qID``,
            ``relevant`` and ``pID``. Modified in place.
        X: Training features, used only when ``train`` is true.
        y: Training labels, used only when ``train`` is true.
        X_test: Feature columns, joined to ``results`` on the index.
        top_k (int): Number of highest-confidence rows to re-rank.
        train (bool): Whether to train ``model`` before re-ranking.

    Returns:
        results (pd.DataFrame): The same object that was passed in.
    """
    if train:
        train_pairwise(model, X, y)

    combined_top = (pd.concat([results, X_test], axis=1)
                    .sort_values('confidence', ascending=False)
                    .head(top_k))
    features_top = combined_top.drop(columns=['confidence', 'qID', 'relevant'])
    top = results.sort_values('confidence', ascending=False).head(top_k)

    # Fewer than two candidates means there is no pair to compare, hence
    # nothing to reorder.
    n_candidates = min(top_k, len(features_top))
    if n_candidates < 2:
        return results

    # Pass the real candidate count so a short result list is not indexed
    # past its end.
    X_relevant_test, X_irrelevant_test = create_test_combinations(features_top, n_candidates)

    # Inference only: no gradient bookkeeping is needed.
    with torch.no_grad():
        predictions = model(
            torch.tensor(X_relevant_test.drop(columns=['pID']).values).float(),
            torch.tensor(X_irrelevant_test.drop(columns=['pID']).values).float())

    pairwise_results = pd.DataFrame({
        'predictions': predictions.reshape(-1).detach().numpy(),
        'd1': X_relevant_test['pID'],
        'd2': X_irrelevant_test['pID']
    })

    ranked_ids = bubble_sort(pairwise_results, list(top['pID']))
    # Collect the rows in ranked order with a single concat.
    result = pd.concat([top[top['pID'] == document] for document in ranked_ids])

    # NOTE(review): this overwrites the first top_k rows by position, so it
    # presumably expects `results` to be sorted by confidence (descending)
    # already - verify against the caller.
    results.iloc[:top_k] = result
    return results