
Tokenizers in Depth

Introduction: The Atoms of Language

Recall writing the lexical analyzer (lexer) for a compiler: the job of splitting source code into its smallest meaningful units, the tokens. if (x > 10) { return true; } becomes IF, LPAREN, IDENT(x), GT, NUMBER(10), and so on.

A natural-language tokenizer plays the same role, but unlike a programming language there are no clear-cut rules. Words? Characters? Something else entirely? This chapter explores how modern tokenizers break language down into "atoms" and why that choice matters.
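
To make the granularity question concrete, here is a minimal sketch (plain Python, no tokenizer library; the sentence is just an example) that shows the same text at three granularities. Real subword tokenizers sit between the word and character extremes.

```python
sentence = "I don't think it's working."

# Word level: split on whitespace (punctuation stays glued to the words)
print(sentence.split())        # ['I', "don't", 'think', "it's", 'working.']

# Character level: every Unicode character is a token
print(list(sentence)[:8])      # ['I', ' ', 'd', 'o', 'n', "'", 't', ' ']

# Byte level: the UTF-8 bytes that byte-level tokenizers start from
print(list(sentence.encode("utf-8"))[:8])   # [73, 32, 100, 111, 110, 39, 116, 32]
```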

19.1 The Basics of Tokenization

Why Tokenization Matters

```python
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple, Optional, Set
from collections import Counter, defaultdict
import regex as re  # better Unicode support than the built-in re module
import json
from dataclasses import dataclass
import heapq
from tqdm import tqdm

class TokenizationBasics:
    """Fundamental concepts of tokenization"""

def explain_tokenization_challenges(self):
    """トークン化の課題を説明"""
    print("=== トークン化の課題 ===\n")

    # 様々な言語での例
    examples = {
        "英語": {
            "text": "I don't think it's working.",
            "word_tokens": ["I", "don't", "think", "it's", "working", "."],
            "char_tokens": list("I don't think it's working."),
            "challenges": "縮約形(don't, it's)の扱い"
        },

        "日本語": {
            "text": "私は猫が好きです。",
            "word_tokens": ["私", "は", "猫", "が", "好き", "です", "。"],
            "char_tokens": list("私は猫が好きです。"),
            "challenges": "単語境界が不明確"
        },

        "ドイツ語": {
            "text": "Donaudampfschifffahrtsgesellschaft",
            "word_tokens": ["Donaudampfschifffahrtsgesellschaft"],
            "char_tokens": list("Donaudampfschifffahrtsgesellschaft"),
            "challenges": "複合語が非常に長い"
        },

        "中国語": {
            "text": "我喜欢吃苹果",
            "word_tokens": ["我", "喜欢", "吃", "苹果"],
            "char_tokens": list("我喜欢吃苹果"),
            "challenges": "スペースがない"
        }
    }

    for lang, info in examples.items():
        print(f"{lang}:")
        print(f"  テキスト: {info['text']}")
        print(f"  単語トークン: {info['word_tokens'][:5]}...")
        print(f"  文字トークン: {info['char_tokens'][:10]}...")
        print(f"  課題: {info['challenges']}\n")

    # トークン化手法の比較
    self._compare_tokenization_methods()

def _compare_tokenization_methods(self):
    """トークン化手法の比較"""
    print("=== トークン化手法の比較 ===\n")

    methods = {
        "Word-level": {
            "vocab_size": "50,000-200,000",
            "OOV_handling": "Poor",
            "morphology": "No",
            "efficiency": "Low"
        },
        "Character-level": {
            "vocab_size": "100-1,000",
            "OOV_handling": "Perfect",
            "morphology": "Implicit",
            "efficiency": "Very Low"
        },
        "Subword (BPE/WordPiece)": {
            "vocab_size": "10,000-100,000",
            "OOV_handling": "Good",
            "morphology": "Partial",
            "efficiency": "High"
        },
        "SentencePiece": {
            "vocab_size": "10,000-100,000",
            "OOV_handling": "Good",
            "morphology": "Partial",
            "efficiency": "High"
        }
    }

    # 表形式で表示
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.axis('tight')
    ax.axis('off')

    # ヘッダー
    headers = ["Method", "Vocab Size", "OOV Handling", "Morphology", "Efficiency"]
    cell_data = []

    for method, props in methods.items():
        row = [method] + list(props.values())
        cell_data.append(row)

    # テーブル作成
    table = ax.table(cellText=cell_data, colLabels=headers,
                    cellLoc='center', loc='center',
                    colWidths=[0.25, 0.2, 0.15, 0.15, 0.15])

    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 2)

    # スタイリング
    for i in range(len(headers)):
        table[(0, i)].set_facecolor('#4CAF50')
        table[(0, i)].set_text_props(weight='bold', color='white')

    # 色分け
    colors = ['#ffebee', '#e8f5e9', '#fff3e0', '#e3f2fd']
    for i, color in enumerate(colors):
        for j in range(len(headers)):
            table[(i+1, j)].set_facecolor(color)

    plt.title('Comparison of Tokenization Methods', fontsize=14, weight='bold', pad=20)
    plt.show()
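
# --- A minimal illustration of the OOV trade-off summarized in the table above ---
# (an illustrative sketch; the tiny word vocabulary below is hypothetical)
def oov_example():
    word_vocab = {"the": 0, "cat": 1, "sat": 2, "<unk>": 3}

    # Word-level lookup: any unseen word collapses into a single <unk> id
    word_ids = [word_vocab.get(w, word_vocab["<unk>"]) for w in "the cat sneezed".split()]

    # Character-level fallback: never out-of-vocabulary, but far longer sequences
    char_ids = [ord(c) for c in "the cat sneezed"]

    print(word_ids)   # [0, 1, 3] -- 'sneezed' has lost its identity
    print(char_ids)   # 15 ids instead of 3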

```

19.2 Byte Pair Encoding (BPE)

```python
class BytePairEncoding:
    """A from-scratch implementation of a BPE tokenizer"""

def __init__(self):
    self.vocab = {}
    self.merges = []

def train(self, texts: List[str], vocab_size: int = 1000):
    """BPEの学習"""
    print("=== BPE学習プロセス ===\n")

    # 初期語彙(文字レベル)
    word_freqs = defaultdict(int)
    for text in texts:
        words = text.split()
        for word in words:
            word_freqs[' '.join(list(word) + ['</w>'])] += 1

    # 初期語彙を作成
    self.vocab = {}
    for word, freq in word_freqs.items():
        for char in word.split():
            if char not in self.vocab:
                self.vocab[char] = len(self.vocab)

    print(f"初期語彙サイズ: {len(self.vocab)}")
    print(f"初期語彙例: {list(self.vocab.keys())[:10]}\n")

    # マージ操作
    num_merges = vocab_size - len(self.vocab)

    for i in tqdm(range(num_merges), desc="Learning merges"):
        # ペアの頻度を計算
        pair_freqs = self._get_pair_frequencies(word_freqs)

        if not pair_freqs:
            break

        # 最頻出ペアを選択
        best_pair = max(pair_freqs, key=pair_freqs.get)
        self.merges.append(best_pair)

        # 語彙を更新
        new_token = ''.join(best_pair)
        self.vocab[new_token] = len(self.vocab)

        # コーパスを更新
        word_freqs = self._merge_pair(word_freqs, best_pair)

        # 進捗表示
        if (i + 1) % 100 == 0:
            print(f"マージ {i+1}: {best_pair} → {new_token}")

    print(f"\n最終語彙サイズ: {len(self.vocab)}")

    # 学習結果の可視化
    self._visualize_vocabulary()

def _get_pair_frequencies(self, word_freqs: Dict[str, int]) -> Dict[Tuple[str, str], int]:
    """ペアの頻度を計算"""
    pair_freqs = defaultdict(int)

    for word, freq in word_freqs.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pair_freqs[(symbols[i], symbols[i + 1])] += freq

    return pair_freqs

def _merge_pair(self, word_freqs: Dict[str, int], 
                pair: Tuple[str, str]) -> Dict[str, int]:
    """Merge the pair wherever it appears as two adjacent, whole symbols"""
    new_word_freqs = {}
    # Match the pair only at symbol boundaries, so e.g. ('a', 'b') does not
    # accidentally merge the tail of a different symbol such as 'xa b'
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    replacement = ''.join(pair)

    for word, freq in word_freqs.items():
        new_word = pattern.sub(replacement, word)
        new_word_freqs[new_word] = freq

    return new_word_freqs

def encode(self, text: str) -> List[int]:
    """テキストをエンコード"""
    words = text.split()
    tokens = []

    for word in words:
        # 単語を文字に分割
        word_tokens = list(word) + ['</w>']

        # マージを適用
        for merge in self.merges:
            i = 0
            while i < len(word_tokens) - 1:
                if (word_tokens[i], word_tokens[i + 1]) == merge:
                    word_tokens = word_tokens[:i] + [''.join(merge)] + word_tokens[i + 2:]
                else:
                    i += 1

        # トークンIDに変換
        for token in word_tokens:
            if token in self.vocab:
                tokens.append(self.vocab[token])
            else:
                # Unknown token
                tokens.append(self.vocab.get('<unk>', 0))

    return tokens

def decode(self, token_ids: List[int]) -> str:
    """Decode token IDs back into text"""
    # Reverse lookup table
    id_to_token = {v: k for k, v in self.vocab.items()}

    tokens = [id_to_token.get(id, '<unk>') for id in token_ids]
    # Concatenate subword pieces directly; '</w>' markers become word boundaries
    text = ''.join(tokens).replace('</w>', ' ').strip()

    return text

def _visualize_vocabulary(self):
    """語彙の可視化"""
    # トークン長の分布
    token_lengths = [len(token.replace('</w>', '')) for token in self.vocab.keys()]

    plt.figure(figsize=(10, 6))
    plt.hist(token_lengths, bins=range(1, max(token_lengths) + 2), 
            alpha=0.7, edgecolor='black')
    plt.xlabel('Token Length (characters)')
    plt.ylabel('Count')
    plt.title('Distribution of Token Lengths in BPE Vocabulary')
    plt.grid(True, alpha=0.3)

    # 統計情報
    avg_length = np.mean(token_lengths)
    plt.axvline(avg_length, color='red', linestyle='--', 
               label=f'Average: {avg_length:.2f}')
    plt.legend()

    plt.tight_layout()
    plt.show()
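
# --- Usage sketch for the BytePairEncoding class defined above ---
# (illustrative only: the toy corpus, vocab_size, and function name are arbitrary)
def bpe_usage_example():
    bpe = BytePairEncoding()
    bpe.train(["low lower lowest", "new newer newest"], vocab_size=40)

    ids = bpe.encode("lower newest")
    print(ids)               # IDs produced by replaying the learned merges in order
    print(bpe.decode(ids))   # round-trips to 'lower newest'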

class BPEDemo:
    """Step-by-step demonstration of the BPE merge process"""

def demonstrate_bpe_process(self):
    """BPEプロセスのデモ"""
    print("=== BPEプロセスの可視化 ===\n")

    # サンプルテキスト
    corpus = [
        "the cat sat on the mat",
        "the dog sat on the log",
        "cats and dogs are pets"
    ]

    # 初期状態
    words = []
    for text in corpus:
        words.extend(text.split())

    # 単語を文字に分割
    word_splits = {}
    for word in set(words):
        word_splits[word] = list(word) + ['</w>']

    print("初期状態(文字分割):")
    for word, splits in list(word_splits.items())[:5]:
        print(f"  {word}: {' '.join(splits)}")

    # マージプロセスのシミュレーション
    merges = [
        ('t', 'h'),      # th
        ('th', 'e'),     # the
        ('a', 't'),      # at
        ('s', 'at'),     # sat
        ('o', 'n'),      # on
    ]

    print("\n\nマージプロセス:")
    current_splits = word_splits.copy()

    for i, (a, b) in enumerate(merges):
        print(f"\nステップ {i+1}: '{a}' + '{b}' → '{a+b}'")

        # マージを適用
        for word, splits in current_splits.items():
            new_splits = []
            j = 0
            while j < len(splits):
                if j < len(splits) - 1 and splits[j] == a and splits[j+1] == b:
                    new_splits.append(a + b)
                    j += 2
                else:
                    new_splits.append(splits[j])
                    j += 1
            current_splits[word] = new_splits

        # 変更された単語を表示
        for word, splits in list(current_splits.items())[:3]:
            print(f"    {word}: {' '.join(splits)}")

    # 最終的なトークン化
    self._visualize_tokenization_result(current_splits)

def _visualize_tokenization_result(self, word_splits: Dict[str, List[str]]):
    """トークン化結果の可視化"""
    fig, ax = plt.subplots(figsize=(12, 6))

    # サンプル文
    sentence = "the cat sat on the mat"
    words = sentence.split()

    y_pos = 0.5
    x_pos = 0

    colors = plt.cm.Set3(np.linspace(0, 1, 20))
    color_idx = 0

    for word in words:
        tokens = word_splits.get(word, list(word) + ['</w>'])

        for token in tokens:
            if token == '</w>':
                # 単語境界マーカー
                width = 0.3
                rect = plt.Rectangle((x_pos, y_pos), width, 0.3,
                                   facecolor='lightgray', 
                                   edgecolor='black', linewidth=1)
                ax.add_patch(rect)
                ax.text(x_pos + width/2, y_pos + 0.15, '</w>',
                       ha='center', va='center', fontsize=8)
            else:
                # 通常のトークン
                width = len(token) * 0.15
                rect = plt.Rectangle((x_pos, y_pos), width, 0.3,
                                   facecolor=colors[color_idx % len(colors)],
                                   edgecolor='black', linewidth=1)
                ax.add_patch(rect)
                ax.text(x_pos + width/2, y_pos + 0.15, token,
                       ha='center', va='center', fontsize=10)
                color_idx += 1

            x_pos += width + 0.05

        x_pos += 0.2  # 単語間のスペース

    ax.set_xlim(-0.1, x_pos)
    ax.set_ylim(0, 1)
    ax.axis('off')
    ax.set_title('BPE Tokenization Result', fontsize=14, weight='bold')

    # 元の文を表示
    ax.text(x_pos/2, 0.9, f'Original: "{sentence}"', 
           ha='center', va='center', fontsize=12, style='italic')

    plt.tight_layout()
    plt.show()

```

19.3 WordPiece and SentencePiece

```python
class WordPieceTokenizer:
    """A WordPiece tokenizer implementation"""

def __init__(self):
    self.vocab = {}
    self.unk_token = '[UNK]'
    self.max_input_chars_per_word = 100

def train(self, texts: List[str], vocab_size: int = 1000):
    """WordPieceの学習(簡略版)"""
    print("=== WordPiece学習 ===\n")

    # 初期語彙の構築
    char_counts = Counter()
    word_counts = Counter()

    for text in texts:
        words = text.lower().split()
        for word in words:
            word_counts[word] += 1
            for char in word:
                char_counts[char] += 1

    # 基本語彙
    self.vocab = {
        '[PAD]': 0,
        '[UNK]': 1,
        '[CLS]': 2,
        '[SEP]': 3,
        '[MASK]': 4
    }

    # 文字を追加
    for char, count in char_counts.most_common():
        if len(self.vocab) < 100:  # 最初の100は文字用
            self.vocab[char] = len(self.vocab)

    # WordPieceの追加
    print(f"初期語彙サイズ: {len(self.vocab)}")

    # Generate and evaluate subword candidates
    while len(self.vocab) < vocab_size:
        # Generate candidates, skipping any already in the vocabulary
        # (otherwise the same best candidate is re-selected forever)
        candidates = [c for c in self._generate_candidates(word_counts)
                      if '##' + c not in self.vocab]

        if not candidates:
            break

        # Score candidates (simplified frequency-based score)
        best_candidate = max(candidates, 
                           key=lambda x: self._score_candidate(x, word_counts))

        # Add to the vocabulary with the WordPiece continuation prefix
        self.vocab['##' + best_candidate] = len(self.vocab)

        # 進捗
        if len(self.vocab) % 100 == 0:
            print(f"語彙サイズ: {len(self.vocab)}")

    print(f"\n最終語彙サイズ: {len(self.vocab)}")

    # WordPieceの特徴を可視化
    self._visualize_wordpiece_features()

def _generate_candidates(self, word_counts: Counter) -> List[str]:
    """サブワード候補を生成"""
    candidates = set()

    for word in word_counts:
        for i in range(len(word)):
            for j in range(i + 1, min(i + 10, len(word) + 1)):
                subword = word[i:j]
                if len(subword) > 1:
                    candidates.add(subword)

    return list(candidates)

def _score_candidate(self, candidate: str, word_counts: Counter) -> float:
    """候補のスコアを計算"""
    score = 0
    for word, count in word_counts.items():
        if candidate in word:
            score += count
    return score
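
# Note: production WordPiece does not rank candidates by raw frequency alone.
# It picks the merge that most increases the corpus likelihood under a unigram
# language model, which reduces to the ratio below (a sketch, using counts over
# the current segmentation):
#
#     score(a, b) = count(ab) / (count(a) * count(b))
#
# e.g. count("hug") = 20, count("hu") = 50, count("g") = 40
#      -> score = 20 / (50 * 40) = 0.01
#
# so a pair wins only if its parts co-occur more often than their individual
# frequencies predict. The frequency-only score above is a deliberate simplification.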

def tokenize(self, text: str) -> List[str]:
    """テキストをトークン化"""
    output_tokens = []

    for word in text.lower().split():
        if len(word) > self.max_input_chars_per_word:
            output_tokens.append(self.unk_token)
            continue

        is_bad = False
        sub_tokens = []
        start = 0

        while start < len(word):
            end = len(word)
            cur_substr = None

            while start < end:
                substr = word[start:end]
                if start > 0:
                    substr = '##' + substr

                if substr in self.vocab:
                    cur_substr = substr
                    break

                end -= 1

            if cur_substr is None:
                is_bad = True
                break

            sub_tokens.append(cur_substr)
            start = end

        if is_bad:
            output_tokens.append(self.unk_token)
        else:
            output_tokens.extend(sub_tokens)

    return output_tokens
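
# --- A worked trace of the greedy longest-match-first loop above ---
# (the toy vocabulary is hypothetical; a real BERT vocabulary has ~30k entries)
def wordpiece_trace_example():
    toy = WordPieceTokenizer()
    toy.vocab = {'[UNK]': 0, 'play': 1, '##ing': 2, 'un': 3, '##believ': 4, '##able': 5}
    print(toy.tokenize("playing"))       # ['play', '##ing']
    print(toy.tokenize("unbelievable"))  # ['un', '##believ', '##able']
    print(toy.tokenize("xylophone"))     # ['[UNK]'] -- no path through the vocabulary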

def _visualize_wordpiece_features(self):
    """WordPieceの特徴を可視化"""
    # ##プレフィックスの統計
    prefix_tokens = [token for token in self.vocab.keys() 
                    if token.startswith('##')]

    print(f"\n##プレフィックス付きトークン: {len(prefix_tokens)}")
    print(f"例: {prefix_tokens[:10]}")

    # トークン化の例
    examples = [
        "playing",
        "unbelievable",
        "internationalization"
    ]

    print("\n\nトークン化の例:")
    for word in examples:
        tokens = self.tokenize(word)
        print(f"  {word} → {tokens}")

class SentencePieceDemo:
    """Demonstration of SentencePiece"""

def explain_sentencepiece(self):
    """SentencePieceの説明"""
    print("=== SentencePiece ===\n")

    print("特徴:")
    print("1. 言語独立:")
    print("   - 前処理不要(トークン化なし)")
    print("   - 生のテキストから直接学習")
    print("   - スペースも通常の文字として扱う\n")

    print("2. 可逆的:")
    print("   - デトークン化で元のテキストを完全復元")
    print("   - 情報の損失なし\n")

    print("3. サブワードの正規化:")
    print("   - 確率的サンプリング")
    print("   - 複数の分割候補から選択\n")

    # アルゴリズムの比較
    self._compare_subword_algorithms()

def _compare_subword_algorithms(self):
    """サブワードアルゴリズムの比較"""
    fig, axes = plt.subplots(1, 3, figsize=(15, 6))

    # BPE
    ax = axes[0]
    ax.set_title('BPE', fontsize=12, weight='bold')

    # BPEのマージプロセス
    bpe_steps = [
        "c a t s </w>",
        "ca t s </w>",
        "ca ts </w>",
        "cats </w>"
    ]

    for i, step in enumerate(bpe_steps):
        y = 0.8 - i * 0.2
        ax.text(0.5, y, step, ha='center', va='center',
               fontsize=10, family='monospace',
               bbox=dict(boxstyle="round,pad=0.3", 
                       facecolor='lightblue', alpha=0.7))

        if i < len(bpe_steps) - 1:
            ax.arrow(0.5, y - 0.05, 0, -0.1, 
                    head_width=0.05, head_length=0.02,
                    fc='black', ec='black')

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
    ax.text(0.5, 0.95, 'Bottom-up Merging', ha='center', 
           fontsize=10, style='italic')

    # WordPiece
    ax = axes[1]
    ax.set_title('WordPiece', fontsize=12, weight='bold')

    # WordPieceの分割
    word = "playing"
    tokens = ["play", "##ing"]

    # 単語全体
    rect = plt.Rectangle((0.2, 0.5), 0.6, 0.2,
                       facecolor='lightgreen', edgecolor='black')
    ax.add_patch(rect)
    ax.text(0.5, 0.6, word, ha='center', va='center', fontsize=12)

    # 分割後
    x_pos = 0.2
    for token in tokens:
        width = 0.25
        rect = plt.Rectangle((x_pos, 0.2), width, 0.15,
                           facecolor='lightyellow', edgecolor='black')
        ax.add_patch(rect)
        ax.text(x_pos + width/2, 0.275, token, 
               ha='center', va='center', fontsize=10)
        x_pos += width + 0.1

    # 矢印
    ax.arrow(0.5, 0.48, 0, -0.1, head_width=0.05, head_length=0.02,
            fc='black', ec='black')

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
    ax.text(0.5, 0.95, 'Likelihood Maximization', ha='center',
           fontsize=10, style='italic')

    # SentencePiece
    ax = axes[2]
    ax.set_title('SentencePiece', fontsize=12, weight='bold')

    # 生テキスト
    text = "▁the▁cat"
    ax.text(0.5, 0.7, text, ha='center', va='center',
           fontsize=12, family='monospace',
           bbox=dict(boxstyle="round,pad=0.3", 
                   facecolor='lightcoral', alpha=0.7))

    # 複数の分割候補
    candidates = [
        ["▁the", "▁cat"],
        ["▁th", "e", "▁cat"],
        ["▁the", "▁c", "at"]
    ]

    y_start = 0.4
    for i, cand in enumerate(candidates):
        y = y_start - i * 0.15
        text = ' '.join(cand)
        ax.text(0.5, y, text, ha='center', va='center',
               fontsize=9, family='monospace',
               bbox=dict(boxstyle="round,pad=0.2",
                       facecolor='lightyellow', alpha=0.5))

    ax.text(0.8, 0.25, 'Sample', ha='center', fontsize=8,
           style='italic', color='red')

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
    ax.text(0.5, 0.95, 'Unigram LM / BPE', ha='center',
           fontsize=10, style='italic')

    plt.tight_layout()
    plt.show()
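
# --- Optional: the same ideas with the actual sentencepiece library ---
# (a minimal sketch assuming `pip install sentencepiece`; the file names,
#  vocab_size and sampling parameters below are illustrative, not prescriptive)
def sentencepiece_example():
    import sentencepiece as spm

    # SentencePiece trains directly on raw text files; no pre-tokenization needed
    with open("toy_corpus.txt", "w", encoding="utf-8") as f:
        f.write("the cat sat on the mat\nthe dog sat on the log\ncats and dogs are pets\n")

    # hard_vocab_limit=False lets the trainer shrink the vocabulary when the
    # toy corpus cannot support the requested size
    spm.SentencePieceTrainer.train(
        input="toy_corpus.txt", model_prefix="toy_sp",
        model_type="unigram", vocab_size=60, hard_vocab_limit=False)

    sp = spm.SentencePieceProcessor(model_file="toy_sp.model")

    # Deterministic segmentation: spaces become the ▁ marker, so decoding is lossless
    print(sp.encode("the cat sat", out_type=str))

    # Subword regularization: sample alternative segmentations of the same text
    for _ in range(3):
        print(sp.encode("the cat sat", out_type=str,
                        enable_sampling=True, alpha=0.1, nbest_size=-1))

    print(sp.decode(sp.encode("the cat sat")))  # recovers the original string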

```

19.4 Modern Tokenizers

```python
class ModernTokenizers:
    """Implementations and comparisons of modern tokenizers"""

def compare_modern_tokenizers(self):
    """現代的トークナイザーの比較"""
    print("=== 現代的なトークナイザー ===\n")

    tokenizers = {
        "GPT-2/GPT-3": {
            "type": "BPE",
            "vocab_size": "50,257",
            "special": "Byte-level BPE",
            "features": "スペース処理の改善"
        },

        "BERT": {
            "type": "WordPiece",
            "vocab_size": "30,522",
            "special": "##プレフィックス",
            "features": "事前トークン化必要"
        },

        "T5/mT5": {
            "type": "SentencePiece",
            "vocab_size": "32,000",
            "special": "▁(スペースマーカー)",
            "features": "言語独立"
        },

        "LLaMA": {
            "type": "SentencePiece (BPE)",
            "vocab_size": "32,000",
            "special": "Byte fallback",
            "features": "未知文字の処理"
        },

        "ChatGPT": {
            "type": "cl100k_base (tiktoken)",
            "vocab_size": "100,277",
            "special": "改良されたBPE",
            "features": "効率的なエンコーディング"
        }
    }

    # 比較表示
    self._visualize_tokenizer_comparison(tokenizers)

    # エンコーディング効率の比較
    self._compare_encoding_efficiency()

def _visualize_tokenizer_comparison(self, tokenizers: Dict[str, Dict[str, str]]):
    """トークナイザーの比較を可視化"""
    # 語彙サイズの比較
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # 語彙サイズ
    names = list(tokenizers.keys())
    vocab_sizes = []
    for name, info in tokenizers.items():
        size_str = info["vocab_size"].replace(",", "")
        vocab_sizes.append(int(size_str))

    colors = plt.cm.viridis(np.linspace(0, 1, len(names)))
    bars = ax1.bar(names, vocab_sizes, color=colors)

    # 値を表示
    for bar, size in zip(bars, vocab_sizes):
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height,
                f'{size:,}', ha='center', va='bottom')

    ax1.set_ylabel('Vocabulary Size')
    ax1.set_title('Vocabulary Sizes of Modern Tokenizers')
    ax1.tick_params(axis='x', rotation=45)

    # タイプ別分類
    type_counts = Counter(info["type"].split()[0] for info in tokenizers.values())

    ax2.pie(type_counts.values(), labels=type_counts.keys(),
           autopct='%1.1f%%', startangle=90)
    ax2.set_title('Distribution of Tokenizer Types')

    plt.tight_layout()
    plt.show()
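
# --- Why byte-level BPE (GPT-2) and byte fallback (LLaMA) never hit OOV ---
# Any string, in any script or emoji, reduces to UTF-8 bytes in the range 0-255,
# so a vocabulary seeded with all 256 byte symbols can always encode the input;
# merges then rebuild frequent characters and words as single tokens.
def byte_fallback_example():
    text = "Hello世界! 🌍"
    raw = list(text.encode("utf-8"))
    print(raw)                      # 'H' -> 72, '世' -> three bytes, '🌍' -> four bytes
    print(len(text), len(raw))      # 10 characters become 17 byte symbols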

def _compare_encoding_efficiency(self):
    """エンコーディング効率の比較"""
    print("\n=== エンコーディング効率の比較 ===")

    # サンプルテキスト
    samples = {
        "English": "The quick brown fox jumps over the lazy dog.",
        "Code": "def factorial(n): return 1 if n <= 1 else n * factorial(n-1)",
        "Mixed": "Hello世界! 🌍 This is a test → λx.x+1",
        "URL": "https://github.com/openai/gpt-3/blob/main/model.py"
    }

    # 仮想的なトークン数(実際の比率に基づく)
    tokenizer_efficiency = {
        "GPT-2": {"English": 11, "Code": 24, "Mixed": 18, "URL": 35},
        "BERT": {"English": 12, "Code": 28, "Mixed": 22, "URL": 40},
        "T5": {"English": 10, "Code": 25, "Mixed": 16, "URL": 38},
        "ChatGPT": {"English": 9, "Code": 20, "Mixed": 14, "URL": 25}
    }

    # ヒートマップで表示
    fig, ax = plt.subplots(figsize=(10, 6))

    tokenizers = list(tokenizer_efficiency.keys())
    text_types = list(samples.keys())

    efficiency_matrix = np.array([
        [tokenizer_efficiency[tok][txt] for txt in text_types]
        for tok in tokenizers
    ])

    im = ax.imshow(efficiency_matrix, cmap='RdYlGn_r', aspect='auto')

    # ラベル
    ax.set_xticks(np.arange(len(text_types)))
    ax.set_yticks(np.arange(len(tokenizers)))
    ax.set_xticklabels(text_types)
    ax.set_yticklabels(tokenizers)

    # 値を表示
    for i in range(len(tokenizers)):
        for j in range(len(text_types)):
            text = ax.text(j, i, efficiency_matrix[i, j],
                         ha="center", va="center", color="black")

    ax.set_title('Token Count Comparison (Lower is Better)')
    plt.colorbar(im, ax=ax, label='Number of Tokens')

    plt.tight_layout()
    plt.show()
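
# --- Measuring real token counts instead of the illustrative numbers above ---
# (a sketch assuming the `tiktoken` package is installed; "gpt2" and
#  "cl100k_base" are encodings that tiktoken ships)
def measure_with_tiktoken():
    import tiktoken

    samples = {
        "English": "The quick brown fox jumps over the lazy dog.",
        "Code": "def factorial(n): return 1 if n <= 1 else n * factorial(n-1)",
        "URL": "https://github.com/openai/gpt-3/blob/main/model.py",
    }

    for name in ("gpt2", "cl100k_base"):
        enc = tiktoken.get_encoding(name)
        counts = {label: len(enc.encode(text)) for label, text in samples.items()}
        print(name, counts)   # cl100k_base generally needs fewer tokens per text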

class TokenizerImplementationTips:
    """Practical tips for implementing tokenizers"""

def share_best_practices(self):
    """ベストプラクティスの共有"""
    print("=== トークナイザー実装のベストプラクティス ===\n")

    practices = {
        "1. 前処理": [
            "Unicode正規化(NFKC)",
            "空白文字の統一",
            "特殊文字のエスケープ",
            "大文字小文字の扱いを決定"
        ],

        "2. 特殊トークン": [
            "[PAD], [UNK], [CLS], [SEP]の追加",
            "タスク固有トークンの設計",
            "予約領域の確保",
            "トークンIDの固定化"
        ],

        "3. 効率化": [
            "トライ木での高速検索",
            "キャッシュの活用",
            "バッチ処理の実装",
            "並列化可能な設計"
        ],

        "4. 堅牢性": [
            "未知文字の適切な処理",
            "最大長の制限",
            "エラーハンドリング",
            "デバッグ情報の出力"
        ]
    }

    for category, items in practices.items():
        print(f"{category}:")
        for item in items:
            print(f"  • {item}")
        print()

    # 実装例
    self._show_implementation_example()
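
# --- Preprocessing sketch for the practices above: NFKC + whitespace cleanup ---
# (an illustrative helper, not part of any particular tokenizer library)
import unicodedata

def normalize_text(text: str) -> str:
    """Apply Unicode NFKC normalization and collapse whitespace runs to single spaces."""
    text = unicodedata.normalize("NFKC", text)
    return " ".join(text.split())

# normalize_text("Ｆｕｌｌ－ｗｉｄｔｈ　ＡＢＣ")  ->  "Full-width ABC"
# normalize_text("ﬁle  ①\t②")                    ->  "file 1 2"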

def _show_implementation_example(self):
    """実装例を表示"""
    print("\n=== 効率的なトークナイザーの実装例 ===\n")

    code = '''

class EfficientTokenizer:
    """An efficient tokenizer built on a trie and an encode cache"""

def __init__(self, vocab: Dict[str, int]):
    self.vocab = vocab
    self.trie = self._build_trie(vocab)
    self.cache = {}

def _build_trie(self, vocab: Dict[str, int]) -> Dict:
    """トライ木の構築"""
    trie = {}
    for token, token_id in vocab.items():
        node = trie
        for char in token:
            if char not in node:
                node[char] = {}
            node = node[char]
        node['<END>'] = token_id
    return trie

def encode(self, text: str) -> List[int]:
    """高速エンコード"""
    # キャッシュチェック
    if text in self.cache:
        return self.cache[text]

    tokens = []
    i = 0

    while i < len(text):
        # 最長一致
        node = self.trie
        longest_token_id = None
        longest_end = i

        for j in range(i, len(text)):
            if text[j] not in node:
                break
            node = node[text[j]]
            if '<END>' in node:
                longest_token_id = node['<END>']
                longest_end = j + 1

        if longest_token_id is not None:
            tokens.append(longest_token_id)
            i = longest_end
        else:
            # Unknown token
            tokens.append(self.vocab.get('<UNK>', 0))
            i += 1

    # キャッシュに保存
    self.cache[text] = tokens
    return tokens

'''

    print(code)

```

Running the Demo

```python
def run_tokenizer_demo():
    """Run the full tokenizer walkthrough"""
    print("=" * 70)
    print("Tokenizers in Depth")
    print("=" * 70 + "\n")

# 1. 基礎概念
basics = TokenizationBasics()
basics.explain_tokenization_challenges()

# 2. BPE
print("\n")
bpe = BytePairEncoding()

# サンプルコーパスでBPEを学習
sample_corpus = [
    "the cat sat on the mat",
    "the dog sat on the log", 
    "cats and dogs are pets",
    "the quick brown fox jumps"
]

bpe.train(sample_corpus, vocab_size=50)

# BPEのデモ
print("\n")
bpe_demo = BPEDemo()
bpe_demo.demonstrate_bpe_process()

# 3. WordPiece
print("\n")
wp = WordPieceTokenizer()
wp.train(sample_corpus, vocab_size=100)

# 4. SentencePiece
print("\n")
sp_demo = SentencePieceDemo()
sp_demo.explain_sentencepiece()

# 5. 現代的なトークナイザー
print("\n")
modern = ModernTokenizers()
modern.compare_modern_tokenizers()

# 6. 実装のヒント
print("\n")
tips = TokenizerImplementationTips()
tips.share_best_practices()

print("\n" + "=" * 70)
print("まとめ")
print("=" * 70)
print("\nトークナイザーの要点:")
print("• 言語の多様性に対応する柔軟性")
print("• 計算効率と表現力のバランス")
print("• サブワード分割による未知語対応")
print("• タスクとモデルに適したトークナイザー選択")
print("\nトークナイザーは言語モデルの「目」であり、")
print("その設計がモデルの性能に大きく影響します。")

if __name__ == "__main__":
    run_tokenizer_demo()
```