# Natural Language Processing

## Data Preprocessing

## Tokenization

Convert a line of text into a list of tokens, where a token is the basic unit of text. A token can be a single character, a word, or a pair of words.

## Vocabulary

Map string tokens to numerical indices starting from 0. Reserved tokens come first, so the unknown token `<unk>` is assigned index 0.

```Python
import collections
import re
from d2l import torch as d2l


class Vocab:  #@save
    """Vocabulary for text."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        '''
        min_freq is the smallest frequency a token must have
        to be added to the vocabulary.
        '''
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort according to frequencies
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # The index for the unknown token is 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # Index for the unknown token
        return 0

    @property
    def token_freqs(self):  # Tokens sorted by descending frequency
        return self._token_freqs


def count_corpus(tokens):  #@save
    """Count token frequencies."""
    # Here `tokens` is a 1D list or 2D list
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)
```

## Combining together

> corpus is a big list storing the index of each token

```Python
def load_corpus_time_machine(max_tokens=-1):
    """Return token indices and the vocabulary of the time machine dataset."""
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    # Since each text line in the time machine dataset is not necessarily a
    # sentence or a paragraph, flatten all the text lines into a single list
    corpus = [vocab[token] for line in tokens for token in line]
    # corpus is a big list storing the index of each token
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab


corpus, vocab = load_corpus_time_machine()
len(corpus), len(vocab)
```
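`load_corpus_time_machine` relies on two helpers, `read_time_machine` and `tokenize`, which this section does not define. A minimal sketch of both, following the d2l book's versions (the `'time_machine'` dataset registration in `d2l.DATA_HUB` is assumed to be in place):

```Python
def read_time_machine():
    """Load the time machine dataset into a list of lowercased text lines."""
    # Assumes the 'time_machine' entry is already registered in d2l.DATA_HUB,
    # as done earlier in the d2l book
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    # Keep letters only; everything else becomes a space
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]


def tokenize(lines, token='word'):
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)
```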
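To make the behavior of `Vocab` concrete, here is a small sketch on a made-up two-line token list (the tokens are hypothetical, not from the dataset):

```Python
tokens = [['the', 'time', 'machine'], ['the', 'time', 'traveller']]
vocab = Vocab(tokens)

vocab['the']              # 1: the most frequent token gets the first free index
vocab[['time', 'warp']]   # [2, 0]: unseen 'warp' falls back to vocab.unk
vocab.to_tokens([1, 2])   # ['the', 'time']: map indices back to tokens
```

Putting `<unk>` at index 0 means any out-of-vocabulary token, including tokens filtered out by `min_freq`, still maps to a valid index instead of raising a `KeyError`.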