Conclusion: SentencePiece is much better at tokenizing, retaining all information even when a word is not in the vocabulary.

Downloads: Jupyter Notebook:

This is the first post in a 3-part series exploring Google's SentencePiece (SP) and its tokenizing power.

In a nutshell, this is the difference between SP's tokenizing and the traditional do-it-yourself approach:

|                                     | sentencepiece             | do-it-yourself            |
|-------------------------------------|---------------------------|---------------------------|
| unknown words (e.g. poopbutt)       | tokenizes to poop+butt    | UNKNOWN                   |
| saving vocabulary and model         | does it for you           | DIY                       |
| multi-threading                     | does it for you           | DIY                       |
| util functions such as String2Index | does it for you           | DIY                       |
| tokenizing speed                    | surprisingly fast!        | depends on implementation |

Once again, Google has delivered a lovely, lovely useful tool.

%load_ext autoreload

import os
import sys
import time

sys.path.append('./imdb_scripts/')   # create_toks lives in the imdb_scripts repo

from fastai.text import *
from create_toks import *
import sentencepiece as spm

Define common variables such as paths and file prefixes…

currvocab = 8        # vocabulary size in thousands (8 -> 8,000 pieces)
is_lowercase = True
curr_vocab_str = f"{currvocab}k"
if is_lowercase: 
    curr_vocab_str = curr_vocab_str + '_lowercase'
train_dir_str = 'data/aclImdb/train/all/'
all_name = 'train_all_lower.txt'
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag
SPM_MODEL_PATH=Path(f'data/aclImdb_spm/{curr_vocab_str}/')
PATH=Path('data/aclImdb/')
CLAS_PATH=Path('data/imdb_clas/')
LM_PATH=Path('data/imdb_lm/')
CLAS_PATH_SPM=Path('data/imdb_clas_spm/')
LM_PATH_SPM=Path('data/imdb_lm_spm/')
chunksize=24000      # rows per chunk when reading the csvs with pandas
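Later cells save arrays under LM_PATH/'tmp' and CLAS_PATH/'tmp' and load the SentencePiece model from LM_PATH_SPM. If you are running this fresh, rather than on top of the earlier fastai IMDB notebooks, you may need to create those directories first; a small sketch (not part of the original run):

for p in (LM_PATH/'tmp', CLAS_PATH/'tmp', LM_PATH_SPM):
    p.mkdir(parents=True, exist_ok=True)   # harmless if the folders already exist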

First of all, the training and testing datasets are lowercased.

This step is omitted here because it is so simple. (There are literally 1000 ways to do it; one is sketched below.)
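For completeness, one of those thousand ways might look like this. The raw-text filename train_all.txt is an assumption; only the lowercased output file appears in this post.

# hypothetical input file; the post only ever shows the lowercased output
with open(train_dir_str + 'train_all.txt') as f_in, \
     open(train_dir_str + 'train_all_lower.txt', 'w') as f_out:
    f_out.write(f_in.read().lower())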

arg_str = f'--input={train_dir_str}{all_name} --model_prefix=model_{curr_vocab_str} --vocab_size={currvocab}000'
print(arg_str)
--input=data/aclImdb/train/all/train_all_lower.txt --model_prefix=model_8k_lowercase --vocab_size=8000
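SentencePiece defaults to its unigram model; if you want to experiment, flags such as --model_type and --character_coverage can simply be appended to the same argument string. The values below are illustrative only, not what this post used:

# illustrative only: switch to BPE and force full character coverage
arg_str_bpe = arg_str + ' --model_type=bpe --character_coverage=1.0'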

This is where Google's SentencePiece trains its tokenizer model on the text (it takes under 5 minutes).

It can be called directly from the command line or from inside Python.

spm.SentencePieceTrainer.Train(arg_str)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-12-e295cb6211df> in <module>()
----> 1 spm.SentencePieceTrainer.Train(arg_str)

KeyboardInterrupt:
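The KeyboardInterrupt above is just this particular run being cut short; a previously trained model is reused below. When training runs to completion, SentencePiece writes model_8k_lowercase.model and model_8k_lowercase.vocab into the current working directory (the names come from --model_prefix). Since the processor is loaded from data/imdb_lm_spm/ further down, the files presumably get moved there; a minimal sketch of that step (not shown in the original):

import shutil

for ext in ('.model', '.vocab'):
    # assumption: the trained files were moved next to the other LM artifacts
    shutil.move(f'model_{curr_vocab_str}{ext}', str(LM_PATH_SPM/f'model_{curr_vocab_str}{ext}'))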

Modify get_texts and get_all for SentencePiece, so they turn a dataframe into token ids.

Google already threads it for you, so that's one less thing to worry about!

def get_texts_spm(spm_model, df, n_lbls):
    tstart = time.time()
    if len(df.columns) == 1:
        # unlabelled data: the single column is the text
        labels = []
        texts = f'\n{BOS} {FLD} 1 ' + df[0].astype(str)
        texts = texts.apply(fixup).values.astype(str)
    else:
        # the first n_lbls columns are labels, the remaining columns are text fields
        labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
        texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
        for i in range(n_lbls+1, len(df.columns)): texts += f' {FLD} {i-n_lbls} ' + df[i].astype(str)
        texts = texts.apply(fixup).values.astype(str)

    #tok = proc_all_mp_smp(partition_by_cores(texts))     # the old multi-process fastai tokenizer
    tok = [spm_model.EncodeAsIds(t) for t in texts]        # SentencePiece: text -> list of ids
    tend = time.time()
    print(f'{(tend-tstart)/(len(texts)/1000):.2f}sec per 1k rows')
    return tok, list(labels)

def get_all_spm(spm_model, df, n_lbls):
    tok, labels = [], []
    for i, r in enumerate(df):          # df is a chunked csv reader, so r is one chunk
        print(i)                        # chunk index, to show progress
        tok_, labels_ = get_texts_spm(spm_model, r, n_lbls)
        tok += tok_
        labels += labels_
    return tok, labels
sp8_lower = spm.SentencePieceProcessor()
sp8_lower.Load('data/imdb_lm_spm/model_8k_lowercase.model')
True
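With the model loaded, the "util functions such as String2Index" row from the table above boils down to a handful of methods on the processor. A quick sanity check (a sketch; the printed values are not copied from a real run):

print(sp8_lower.GetPieceSize())                            # vocabulary size, 8000 here
print(sp8_lower.PieceToId('▁the'))                         # string -> index
print(sp8_lower.IdToPiece(sp8_lower.PieceToId('▁the')))    # index -> string, round-trips
ids = sp8_lower.EncodeAsIds('this movie was great')
print(sp8_lower.DecodeIds(ids))                            # ids -> text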
print(sp8_lower.EncodeAsPieces("I wish I knew what the FUCKingHELL is up with the thingy".lower()))
['▁i', '▁wish', '▁i', '▁knew', '▁what', '▁the', '▁fu', 'ck', 'ing', 'hell', '▁is', '▁up', '▁with', '▁the', '▁thing', 'y']
print(sp8_lower.EncodeAsPieces("Shittingduckcrappoopercrackingjack".lower()))
['▁shi', 't', 'ting', 'd', 'uck', 'c', 'ra', 'pp', 'oo', 'per', 'crack', 'ing', 'jack']
print(sp8_lower.EncodeAsPieces("reaaaaaaaaaaaaaaaaaally".lower()))
['▁re', 'a', 'aaaa', 'aaaa', 'aaaa', 'aaaa', 'ally']
print(sp8_lower.EncodeAsPieces('ElectricDildoInYourButt'.lower()))
['▁electric', 'd', 'il', 'do', 'in', 'y', 'our', 'but', 't']
print(sp8_lower.EncodeAsPieces('There is value to simplicity\nWhich offers more explicitly\nSoundings more exquisitely\nWhen words are not too long\n'.lower()))
['▁there', '▁is', '▁value', '▁to', '▁simplicity', '\n', 'which', '▁offers', '▁more', '▁explicit', 'ly', '\n', 'sound', 'ing', 's', '▁more', '▁exquisite', 'ly', '\n', 'when', '▁words', '▁are', '▁not', '▁too', '▁long', '\n']
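The flip side of splitting unknown words into pieces is that nothing is thrown away: DecodePieces stitches the pieces straight back into the original (lowercased) string.

pieces = sp8_lower.EncodeAsPieces('ElectricDildoInYourButt'.lower())
print(sp8_lower.DecodePieces(pieces))   # should print 'electricdildoinyourbutt'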
df_trn = pd.read_csv(LM_PATH/'train_lower.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(LM_PATH/'test_lower.csv', header=None, chunksize=chunksize)
#get_all_spm(sp32_lower, df_trn, 1)
tok_trn_spm, trn_labels_spm = get_all_spm(sp8_lower, df_trn, 1)
tok_val_spm, val_labels_spm = get_all_spm(sp8_lower, df_val, 1)
0
1.89sec per 1k rows
1
2.34sec per 1k rows
2
2.30sec per 1k rows
3
2.14sec per 1k rows
0
1.95sec per 1k rows
len(tok_trn_spm), len(tok_val_spm)
(90000, 10000)
(CLAS_PATH/'tmp').mkdir(exist_ok=True)

np.save(LM_PATH/'tmp'/'tok_trn_spm8_lower_ids.npy', tok_trn_spm)
np.save(LM_PATH/'tmp'/'tok_val_spm8_lower_ids.npy', tok_val_spm)
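A side note in case you rerun this with a newer NumPy: each review has a different number of tokens, so tok_trn_spm is a ragged list of lists, and newer NumPy versions may refuse to save it directly. Wrapping it in an object array first keeps the same on-disk result (this is an assumption about newer NumPy, not something the original run needed):

# only needed on NumPy versions that reject ragged lists
np.save(LM_PATH/'tmp'/'tok_trn_spm8_lower_ids.npy', np.array(tok_trn_spm, dtype=object))
np.save(LM_PATH/'tmp'/'tok_val_spm8_lower_ids.npy', np.array(tok_val_spm, dtype=object))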

Now tokenize for the classification task.

df_trn = pd.read_csv(CLAS_PATH/'train_lower.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(CLAS_PATH/'test_lower.csv', header=None, chunksize=chunksize)
tok_trn_spm, trn_labels_spm = get_all_spm(sp8_lower, df_trn, 1)
tok_val_spm, val_labels_spm = get_all_spm(sp8_lower, df_val, 1)
0
1.68sec per 1k rows
1
1.63sec per 1k rows
0
1.72sec per 1k rows
1
1.80sec per 1k rows
len(tok_trn_spm), len(tok_val_spm)
(25000, 25000)
np.save(CLAS_PATH/'tmp'/'tok_trn_spm8_lower_ids.npy', tok_trn_spm)
np.save(CLAS_PATH/'tmp'/'tok_val_spm8_lower_ids.npy', tok_val_spm)


# labels are no good. ignore. 
np.save(CLAS_PATH/'tmp'/'trn_labels_lower_spm8.npy', trn_labels_spm)
np.save(CLAS_PATH/'tmp'/'val_labels_lower_spm8.npy', val_labels_spm)
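One caveat for the next post in the series: because these saved arrays are object arrays of ragged token-id lists, loading them back needs allow_pickle=True on NumPy versions where it defaults to False (an assumption about the follow-up notebook):

# loading the saved id arrays back in the next step
tok_trn_spm = np.load(CLAS_PATH/'tmp'/'tok_trn_spm8_lower_ids.npy', allow_pickle=True)
tok_val_spm = np.load(CLAS_PATH/'tmp'/'tok_val_spm8_lower_ids.npy', allow_pickle=True)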