        Args:
            input_size (int): embedding_dim or embedding_dim + hidden_size
            hidden_size (int): hidden size
            kernel_sizes (list[int]): the sizes of the kernels
            encoder_layers (int): number of conv layers
            dropout (float): dropout ratio
        """
        super().__init__()
        self.encoders = nn.ModuleList(
            [
                Conv1d(
                    in_channels=input_size if i == 0 else hidden_size,
                    out_channels=hidden_size,
                    kernel_sizes=kernel_sizes,
                )
                for i in range(encoder_layers)
            ]
        )
        for i, encoder in enumerate(self.encoders):
            # fill elements of x with 0.0 where mask is False
            x.masked_fill_(~mask, 0.0)
            # apply dropout between conv layers
            if i > 0:
                x = self.dropout(x)
            # x returned by the encoder: (batch_size, hidden_size, seq_len)
            x = encoder(x)
        Args:
            x (Tensor): the output of the previous block, (batch_size, seq_len, hidden_size)
            res (Tensor): (batch_size, seq_len, embedding_size) or
                (batch_size, seq_len, embedding_size + hidden_size);
                res[:, :, hidden_size:] is the output of the Embedding layer and
                res[:, :, :hidden_size] is the output of the previous two blocks
            i (int): layer index
        Returns:
            Tensor: (batch_size, seq_len, hidden_size + embedding_size)
        """
        if i == 1:
            # (batch_size, seq_len, hidden_size + embedding_size)
            return torch.cat([x, res], dim=-1)
        hidden_size = x.size(-1)
        # (res[:, :, :hidden_size] + x) is the summation of the outputs of the previous two blocks
        # x (batch_size, seq_len, hidden_size)
        x = (res[:, :, :hidden_size] + x) * math.sqrt(0.5)
        # (batch_size, seq_len, hidden_size + embedding_size)
        return torch.cat([x, res[:, :, hidden_size:]], dim=-1)
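To make the shape bookkeeping concrete, here is a minimal standalone sketch of this connection rule for i >= 2 (the sizes and tensors below are made up for illustration and are not part of the model code):

import math
import torch

batch_size, seq_len, hidden_size, embedding_dim = 2, 5, 4, 3
x = torch.randn(batch_size, seq_len, hidden_size)                    # current block output
res = torch.randn(batch_size, seq_len, hidden_size + embedding_dim)  # previous block input

# sum the two most recent block outputs and rescale to keep the variance stable
summed = (res[:, :, :hidden_size] + x) * math.sqrt(0.5)
# re-attach the original embeddings kept in the trailing channels of res
out = torch.cat([summed, res[:, :, hidden_size:]], dim=-1)
print(out.shape)  # torch.Size([2, 5, 7]) == (batch_size, seq_len, hidden_size + embedding_dim)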
        Returns:
            Tensor: (batch_size, hidden_size)
        """
        # max returns a namedtuple (values, indices); we only need the values
        return x.masked_fill(~mask, -float("inf")).max(dim=1)[0]
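The masked max pooling is easiest to see on a toy batch. In the sketch below (the values are made up), padded positions are filled with -inf so that the max over the sequence dimension can never pick them:

import torch

# two sequences of length 3 with hidden_size 2; False in mask marks padding
x = torch.tensor([[[1.0, 2.0], [3.0, 0.5], [9.0, 9.0]],
                  [[4.0, 1.0], [9.0, 9.0], [9.0, 9.0]]])
mask = torch.tensor([[[True], [True], [False]],
                     [[True], [False], [False]]])

pooled = x.masked_fill(~mask, -float("inf")).max(dim=1)[0]
print(pooled)  # tensor([[3., 2.], [4., 1.]])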
        Returns:
            Tensor: (batch_size, num_classes)
        """
        # a (batch_size, seq_len, embedding_dim)
        a = self.embedding(a)
        # b (batch_size, seq_len, embedding_dim)
        b = self.embedding(b)

        res_a, res_b = a, b

        for i, block in enumerate(self.blocks):
            if i > 0:
                # a (batch_size, seq_len, embedding_dim + hidden_size)
                a = self.connection(a, res_a, i)
                # b (batch_size, seq_len, embedding_dim + hidden_size)
                b = self.connection(b, res_b, i)
                # the embeddings are now stored in res_a[:, :, hidden_size:]
                res_a, res_b = a, b
            # a_enc (batch_size, seq_len, hidden_size)
            a_enc = block["encoder"](a, mask_a)
            # b_enc (batch_size, seq_len, hidden_size)
            b_enc = block["encoder"](b, mask_b)
            # concatenate the input and output of the encoder
            # a (batch_size, seq_len, embedding_dim + hidden_size or embedding_dim + hidden_size * 2)
            a = torch.cat([a, a_enc], dim=-1)
            # b (batch_size, seq_len, embedding_dim + hidden_size or embedding_dim + hidden_size * 2)
            b = torch.cat([b, b_enc], dim=-1)
            # align_a (batch_size, seq_len, embedding_dim + hidden_size or embedding_dim + hidden_size * 2)
            # align_b (batch_size, seq_len, embedding_dim + hidden_size or embedding_dim + hidden_size * 2)
            align_a, align_b = block["alignment"](a, b, mask_a, mask_b)
            # a (batch_size, seq_len, hidden_size)
            a = block["fusion"](a, align_a)
            # b (batch_size, seq_len, hidden_size)
            b = block["fusion"](b, align_b)

        # a (batch_size, hidden_size)
        a = self.pooling(a, mask_a)
        # b (batch_size, hidden_size)
        b = self.pooling(b, mask_b)
        # (batch_size, num_classes)
        return self.prediction(a, b)
Note how the input dimensions differ between blocks.
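For example, with embedding_dim = 300 and hidden_size = 150 (illustrative values only, not fixed by the model), the per-block widths work out as in the short sketch below:

embedding_dim, hidden_size = 300, 150  # illustrative values only

for i in range(3):  # assume three blocks
    # block 0 only sees the embeddings; later blocks see embeddings + previous hidden states
    encoder_in = embedding_dim if i == 0 else embedding_dim + hidden_size
    # the encoder output (hidden_size) is concatenated back onto its own input
    # before alignment and fusion
    alignment_in = encoder_in + hidden_size
    print(f"block {i}: encoder input = {encoder_in}, alignment/fusion input = {alignment_in}")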
Data Preparation
The data preparation step is explained in detail in the article.
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import json
from torch.utils.data import Dataset
import pandas as pd
from typing import Tuple
UNK_TOKEN = "<UNK>"
PAD_TOKEN = "<PAD>"
class Vocabulary:
    """Class to process text and extract vocabulary for mapping"""
    def __init__(self, token_to_idx: dict = None, tokens: list[str] = None) -> None:
        """
        Args:
            token_to_idx (dict, optional): a pre-existing map of tokens to indices. Defaults to None.
            tokens (list[str], optional): a list of unique tokens with no duplicates. Defaults to None.
        """
        assert any(
            [tokens, token_to_idx]
        ), "At least one of these parameters should be set as not None."
        if token_to_idx:
            self._token_to_idx = token_to_idx
        else:
            self._token_to_idx = {}
            if PAD_TOKEN not in tokens:
                tokens = [PAD_TOKEN] + tokens
            for idx, token in enumerate(tokens):
                self._token_to_idx[token] = idx

        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
    @classmethod
    def build(
        cls,
        sentences: list[list[str]],
        min_freq: int = 2,
        reserved_tokens: list[str] = None,
    ) -> "Vocabulary":
        """Construct the Vocabulary from sentences
        Args:
            sentences (list[list[str]]): a list of tokenized sequences
            min_freq (int, optional): the minimum word frequency to be saved. Defaults to 2.
            reserved_tokens (list[str], optional): the reserved tokens to add into the Vocabulary. Defaults to None.
        Returns:
            Vocabulary: a Vocabulary instance
        """
        token_freqs = defaultdict(int)
        for sentence in tqdm(sentences):
            for token in sentence:
                token_freqs[token] += 1
        unique_tokens = (reserved_tokens if reserved_tokens else []) + [UNK_TOKEN]
        unique_tokens += [
            token
            for token, freq in token_freqs.items()
            if freq >= min_freq and token != UNK_TOKEN
        ]
        return cls(tokens=unique_tokens)
    def __getitem__(self, tokens: list[str] | str) -> list[int] | int:
        """Retrieve the indices associated with a list of tokens, or the index of a single token
        Args:
            tokens (list[str] | str): a list of tokens or a single token
        Returns:
            list[int] | int: the indices or the single index
        """
        if