"""
文本向量化模块实现文本向量化包括词袋模型、TF-IDF和词嵌入等多种文本表示方法
"""
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle
import os
from typing import List, Dict, Tuple, Optional, Any, Union, Callable
import gensim
from gensim.models import Word2Vec, KeyedVectors
from config.system_config import PROCESSED_DATA_DIR, EMBEDDINGS_DIR
from config.model_config import (
MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, MIN_WORD_FREQUENCY
)
from utils.logger import get_logger
from utils.file_utils import save_pickle, load_pickle, ensure_dir
from preprocessing.tokenization import ChineseTokenizer
logger = get_logger("Vectorizer")
class TextVectorizer:
    """Base class for text vectorizers; defines the common interface."""

    def __init__(self, max_features: int = MAX_NUM_WORDS):
        """
        Initialize the text vectorizer.

        Args:
            max_features: Maximum number of features (vocabulary size).
        """
        self.max_features = max_features
        self.vectorizer = None
        self.is_fitted = False

    def fit(self, texts: List[str]) -> None:
        """
        Fit the vectorizer on the given texts.

        Args:
            texts: List of texts.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Transform texts into their vector representation.

        Args:
            texts: List of texts.

        Returns:
            Vector representation.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def fit_transform(self, texts: List[str]) -> np.ndarray:
        """
        Fit the vectorizer on the given texts, then transform them.

        Args:
            texts: List of texts.

        Returns:
            Vector representation.
        """
        self.fit(texts)
        return self.transform(texts)

    def save(self, path: str) -> None:
        """
        Save the vectorizer.

        Args:
            path: Target path.
        """
        ensure_dir(os.path.dirname(path))
        save_pickle(self.vectorizer, path)
        logger.info(f"Vectorizer saved to: {path}")

    def load(self, path: str) -> None:
        """
        Load the vectorizer.

        Args:
            path: Source path.
        """
        self.vectorizer = load_pickle(path)
        self.is_fitted = True
        logger.info(f"Vectorizer loaded from {path}")

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            The vocabulary.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            The vocabulary size.
        """
        raise NotImplementedError("Subclasses must implement this method")
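

# Interface sketch (illustrative; not part of the original module): every
# concrete vectorizer below plugs into this same fit/transform/save contract,
# so a pipeline can swap representations without other changes.
def _vectorize_corpus(vectorizer: TextVectorizer, texts: List[str], path: str):
    features = vectorizer.fit_transform(texts)  # delegates to the subclass
    vectorizer.save(path)                       # persist the fitted state
    return features
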
class BagOfWordsVectorizer(TextVectorizer):
    """Bag-of-words vectorizer."""

    def __init__(self, max_features: int = MAX_NUM_WORDS,
                 min_df: int = MIN_WORD_FREQUENCY,
                 tokenizer: Optional[Callable[[str], List[str]]] = None,
                 binary: bool = False):
        """
        Initialize the bag-of-words vectorizer.

        Args:
            max_features: Maximum number of features (vocabulary size).
            min_df: Minimum document frequency.
            tokenizer: Tokenizer function that takes a text and returns a list of tokens.
            binary: Use binary counts (only whether a token occurs, not how often).
        """
        super().__init__(max_features)
        self.min_df = min_df
        self.binary = binary
        # Create sklearn's CountVectorizer
        self.vectorizer = CountVectorizer(
            max_features=max_features,
            min_df=min_df,
            tokenizer=tokenizer,
            binary=binary
        )

    def fit(self, texts: List[str]) -> None:
        """
        Fit the bag-of-words model on the given texts.

        Args:
            texts: List of texts.
        """
        self.vectorizer.fit(texts)
        self.is_fitted = True
        logger.info(f"Bag-of-words model fitted, vocabulary size: {len(self.vectorizer.vocabulary_)}")

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Transform texts into bag-of-words vectors.

        Args:
            texts: List of texts.

        Returns:
            Bag-of-words representation (sparse matrix).
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        return self.vectorizer.transform(texts)

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            The vocabulary (ordered by index).
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        # The CountVectorizer vocabulary is a dict mapping token -> index
        vocab_dict = self.vectorizer.vocabulary_
        vocab_list = [""] * len(vocab_dict)
        for word, idx in vocab_dict.items():
            vocab_list[idx] = word
        return vocab_list

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            The vocabulary size.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        return len(self.vectorizer.vocabulary_)
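

# Usage sketch (illustrative; not part of the original module). Passing
# tokenizer=str.split assumes the input was pre-segmented (e.g. by a Chinese
# word segmenter) and joined with spaces; min_df=1 suits the toy corpus.
def _demo_bag_of_words() -> None:
    texts = ["我 喜欢 机器 学习", "我 喜欢 深度 学习"]
    bow = BagOfWordsVectorizer(max_features=1000, min_df=1, tokenizer=str.split)
    matrix = bow.fit_transform(texts)  # scipy sparse matrix, shape (2, vocab size)
    print(matrix.toarray())
    print(bow.get_vocabulary())        # tokens ordered by column index
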
class TfidfVectorizer(TextVectorizer):
    """TF-IDF vectorizer."""

    def __init__(self, max_features: int = MAX_NUM_WORDS,
                 min_df: int = MIN_WORD_FREQUENCY,
                 tokenizer: Optional[Callable[[str], List[str]]] = None,
                 norm: str = 'l2',
                 use_idf: bool = True,
                 smooth_idf: bool = True,
                 sublinear_tf: bool = False):
        """
        Initialize the TF-IDF vectorizer.

        Args:
            max_features: Maximum number of features (vocabulary size).
            min_df: Minimum document frequency.
            tokenizer: Tokenizer function that takes a text and returns a list of tokens.
            norm: Normalization method, L2 norm by default.
            use_idf: Whether to use IDF (inverse document frequency).
            smooth_idf: Whether to smooth IDF weights.
            sublinear_tf: Whether to apply sublinear scaling (take the log of TF).
        """
        super().__init__(max_features)
        self.min_df = min_df
        # Create sklearn's TfidfVectorizer (imported under an alias, since this
        # class shadows the sklearn name)
        self.vectorizer = SklearnTfidfVectorizer(
            max_features=max_features,
            min_df=min_df,
            tokenizer=tokenizer,
            norm=norm,
            use_idf=use_idf,
            smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf
        )

    def fit(self, texts: List[str]) -> None:
        """
        Fit the TF-IDF model on the given texts.

        Args:
            texts: List of texts.
        """
        self.vectorizer.fit(texts)
        self.is_fitted = True
        logger.info(f"TF-IDF model fitted, vocabulary size: {len(self.vectorizer.vocabulary_)}")

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Transform texts into TF-IDF vectors.

        Args:
            texts: List of texts.

        Returns:
            TF-IDF representation (sparse matrix).
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        return self.vectorizer.transform(texts)

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            The vocabulary (ordered by index).
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        # The TfidfVectorizer vocabulary is a dict mapping token -> index
        vocab_dict = self.vectorizer.vocabulary_
        vocab_list = [""] * len(vocab_dict)
        for word, idx in vocab_dict.items():
            vocab_list[idx] = word
        return vocab_list

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            The vocabulary size.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        return len(self.vectorizer.vocabulary_)

    def get_feature_names(self) -> List[str]:
        """
        Get the feature names (the vocabulary).

        Returns:
            List of feature names.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        return list(self.vectorizer.get_feature_names_out())

    def get_idf(self) -> np.ndarray:
        """
        Get the IDF weights.

        Returns:
            Array of IDF weights.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        return self.vectorizer.idf_
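

# Usage sketch (illustrative; not part of the original module). This module's
# TfidfVectorizer deliberately shadows sklearn's class of the same name, so
# the wrapper used below is the one defined above.
def _demo_tfidf() -> None:
    texts = ["我 喜欢 机器 学习", "我 喜欢 深度 学习"]
    tfidf = TfidfVectorizer(max_features=1000, min_df=1, tokenizer=str.split)
    matrix = tfidf.fit_transform(texts)  # L2-normalized TF-IDF sparse matrix
    print(tfidf.get_feature_names())     # column order matches the matrix
    print(tfidf.get_idf())               # one IDF weight per feature
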
class SequenceVectorizer(TextVectorizer):
    """Sequence vectorizer (wraps the Keras Tokenizer)."""

    def __init__(self, max_features: int = MAX_NUM_WORDS,
                 max_sequence_length: int = MAX_SEQUENCE_LENGTH,
                 oov_token: str = "<OOV>",
                 padding: str = "post",
                 truncating: str = "post"):
        """
        Initialize the sequence vectorizer.

        Args:
            max_features: Maximum number of features (vocabulary size).
            max_sequence_length: Maximum sequence length.
            oov_token: Token used for out-of-vocabulary words.
            padding: Padding mode, 'pre' or 'post'.
            truncating: Truncation mode, 'pre' or 'post'.
        """
        super().__init__(max_features)
        self.max_sequence_length = max_sequence_length
        self.oov_token = oov_token
        self.padding = padding
        self.truncating = truncating
        # Create the Keras Tokenizer
        self.vectorizer = Tokenizer(num_words=max_features, oov_token=oov_token)

    def fit(self, texts: List[str]) -> None:
        """
        Fit the sequence vectorizer on the given texts.

        Args:
            texts: List of texts.
        """
        self.vectorizer.fit_on_texts(texts)
        self.is_fitted = True
        logger.info(f"Sequence vectorizer fitted, vocabulary size: {len(self.vectorizer.word_index)}")

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Transform texts into padded integer sequences.

        Args:
            texts: List of texts.

        Returns:
            Integer sequence representation.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        sequences = self.vectorizer.texts_to_sequences(texts)
        padded_sequences = pad_sequences(
            sequences,
            maxlen=self.max_sequence_length,
            padding=self.padding,
            truncating=self.truncating
        )
        return padded_sequences

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            The vocabulary (ordered by index).
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        # The Tokenizer vocabulary is a dict mapping word -> index (1-based)
        word_index = self.vectorizer.word_index
        index_word = {index: word for word, index in word_index.items()}
        # Index 0 is reserved for padding. When oov_token is set, Keras already
        # assigns it index 1, so it is picked up by the loop below and must not
        # be appended separately.
        vocab = ["<PAD>"]
        max_index = min(self.max_features, len(word_index) + 1) if self.max_features else len(word_index) + 1
        for i in range(1, max_index):
            if i in index_word:
                vocab.append(index_word[i])
        return vocab

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            The vocabulary size.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        # +1 because index 0 is reserved for padding
        return min(self.max_features, len(self.vectorizer.word_index) + 1) if self.max_features else len(
            self.vectorizer.word_index) + 1

    def texts_to_sequences(self, texts: List[str]) -> List[List[int]]:
        """
        Convert texts to integer sequences (without padding).

        Args:
            texts: List of texts.

        Returns:
            List of integer sequences.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        return self.vectorizer.texts_to_sequences(texts)

    def sequences_to_padded(self, sequences: List[List[int]]) -> np.ndarray:
        """
        Pad integer sequences to the configured length.

        Args:
            sequences: List of integer sequences.

        Returns:
            Padded integer sequences.
        """
        return pad_sequences(
            sequences,
            maxlen=self.max_sequence_length,
            padding=self.padding,
            truncating=self.truncating
        )

    def save(self, path: str) -> None:
        """
        Save the sequence vectorizer.

        Args:
            path: Target path.
        """
        ensure_dir(os.path.dirname(path))
        # Save configuration and state
        tokenizer_state = {
            'tokenizer': self.vectorizer,
            'max_features': self.max_features,
            'max_sequence_length': self.max_sequence_length,
            'oov_token': self.oov_token,
            'padding': self.padding,
            'truncating': self.truncating,
            'is_fitted': self.is_fitted
        }
        save_pickle(tokenizer_state, path)
        logger.info(f"Sequence vectorizer saved to: {path}")

    def load(self, path: str) -> None:
        """
        Load the sequence vectorizer.

        Args:
            path: Source path.
        """
        tokenizer_state = load_pickle(path)
        self.vectorizer = tokenizer_state['tokenizer']
        self.max_features = tokenizer_state['max_features']
        self.max_sequence_length = tokenizer_state['max_sequence_length']
        self.oov_token = tokenizer_state['oov_token']
        self.padding = tokenizer_state['padding']
        self.truncating = tokenizer_state['truncating']
        self.is_fitted = tokenizer_state['is_fitted']
        logger.info(f"Sequence vectorizer loaded from {path}, vocabulary size: {len(self.vectorizer.word_index)}")
class Word2VecVectorizer(TextVectorizer):
    """Word2Vec embedding vectorizer."""

    def __init__(self, vector_size: int = 100,
                 window: int = 5,
                 min_count: int = MIN_WORD_FREQUENCY,
                 workers: int = 4,
                 sg: int = 1,  # 1 = Skip-gram, 0 = CBOW
                 max_sequence_length: int = MAX_SEQUENCE_LENGTH,
                 padding: str = "post",
                 truncating: str = "post",
                 pretrained_path: Optional[str] = None):
        """
        Initialize the Word2Vec embedding vectorizer.

        Args:
            vector_size: Dimensionality of the word vectors.
            window: Context window size.
            min_count: Minimum word frequency.
            workers: Number of worker threads for training.
            sg: Training algorithm: 1 for Skip-gram, 0 for CBOW.
            max_sequence_length: Maximum sequence length.
            padding: Padding mode, 'pre' or 'post'.
            truncating: Truncation mode, 'pre' or 'post'.
            pretrained_path: Path to pretrained word vectors; if not None, they are loaded.
        """
        super().__init__(max_features=None)  # Word2Vec has no max_features limit
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.sg = sg
        self.max_sequence_length = max_sequence_length
        self.padding = padding
        self.truncating = truncating
        self.pretrained_path = pretrained_path
        # The Word2Vec model (or bare KeyedVectors when loaded from pretrained files)
        self.model = None
        # Vocabulary mappings
        self.word_index = {}
        self.index_word = {}
        # Load pretrained vectors if a path was given
        if pretrained_path and os.path.exists(pretrained_path):
            self._load_pretrained(pretrained_path)

    @property
    def _vectors(self) -> KeyedVectors:
        """Return the underlying KeyedVectors, whether self.model is a full
        Word2Vec model (vectors live on .wv) or bare KeyedVectors."""
        return self.model.wv if isinstance(self.model, Word2Vec) else self.model

    def _load_pretrained(self, path: str) -> None:
        """
        Load pretrained word vectors.

        Args:
            path: Path to the pretrained vectors.
        """
        try:
            # First try loading a full Word2Vec model
            self.model = Word2Vec.load(path)
            logger.info(f"Loaded pretrained Word2Vec model: {path}")
        except Exception:
            try:
                # Fall back to keyed vectors (word2vec/GloVe/FastText text or binary format)
                self.model = KeyedVectors.load_word2vec_format(path, binary=path.endswith('.bin'))
                logger.info(f"Loaded pretrained word vectors: {path}")
            except Exception as e:
                logger.error(f"Failed to load pretrained word vectors: {e}")
                return
        # If loading succeeded, build the vocabulary
        self._build_vocab_from_model()
        self.is_fitted = True

    def _build_vocab_from_model(self) -> None:
        """Build the vocabulary from the model."""
        # Get the model's vocabulary
        vocabulary = list(self._vectors.index_to_key)
        # Build the index mappings; index 0 is reserved for padding
        self.word_index = {word: idx + 1 for idx, word in enumerate(vocabulary)}
        self.index_word = {idx + 1: word for idx, word in enumerate(vocabulary)}
        self.index_word[0] = "<PAD>"

    def fit(self, tokenized_texts: List[List[str]]) -> None:
        """
        Train the Word2Vec model on tokenized texts.

        Args:
            tokenized_texts: List of tokenized texts (each text is a list of tokens).
        """
        # Skip training when a pretrained model is already loaded
        if self.is_fitted and self.model is not None:
            logger.info("Pretrained model already loaded, skipping training")
            return
        # Train the Word2Vec model
        self.model = Word2Vec(
            sentences=tokenized_texts,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            sg=self.sg
        )
        # Build the vocabulary
        self._build_vocab_from_model()
        self.is_fitted = True
        logger.info(f"Word2Vec model trained, vocabulary size: {len(self.word_index)}")

    def transform(self, tokenized_texts: List[List[str]]) -> np.ndarray:
        """
        Transform tokenized texts into sequences of word vectors.

        Args:
            tokenized_texts: List of tokenized texts (each text is a list of tokens).

        Returns:
            Word vector sequences with shape (num_samples, max_sequence_length, vector_size).
        """
        if not self.is_fitted or self.model is None:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        # Allocate the result array
        result = np.zeros((len(tokenized_texts), self.max_sequence_length, self.vector_size))
        # Process each text
        for i, text in enumerate(tokenized_texts):
            seq_len = min(len(text), self.max_sequence_length)
            # Truncate according to the configured mode
            if self.truncating == 'pre' and len(text) > self.max_sequence_length:
                text = text[-self.max_sequence_length:]
            elif self.truncating == 'post' and len(text) > self.max_sequence_length:
                text = text[:self.max_sequence_length]
            # Look up the vector of each word
            for j, word in enumerate(text[:seq_len]):
                if word in self._vectors:
                    # Position depends on the padding mode
                    pos = j if self.padding == 'post' else self.max_sequence_length - seq_len + j
                    result[i, pos] = self._vectors[word]
        return result

    def transform_to_indices(self, tokenized_texts: List[List[str]]) -> np.ndarray:
        """
        Transform tokenized texts into padded sequences of word indices.

        Args:
            tokenized_texts: List of tokenized texts (each text is a list of tokens).

        Returns:
            Word index sequences.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        # Map words to indices
        sequences = []
        for text in tokenized_texts:
            # Out-of-vocabulary words map to 0, which is also the padding index
            seq = [self.word_index.get(word, 0) for word in text]
            sequences.append(seq)
        # Pad the sequences
        padded_sequences = pad_sequences(
            sequences,
            maxlen=self.max_sequence_length,
            padding=self.padding,
            truncating=self.truncating
        )
        return padded_sequences

    def get_embedding_matrix(self) -> np.ndarray:
        """
        Get the embedding matrix, e.g. to initialize the weights of an Embedding layer.

        Returns:
            Embedding matrix with shape (vocabulary size, vector size).
        """
        if not self.is_fitted or self.model is None:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        vocab_size = len(self.word_index) + 1  # +1 because index 0 is reserved for padding
        embedding_matrix = np.zeros((vocab_size, self.vector_size))
        # Fill the embedding matrix
        for word, idx in self.word_index.items():
            if word in self._vectors:
                embedding_matrix[idx] = self._vectors[word]
        return embedding_matrix

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            The vocabulary (ordered by index).
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        vocab = ["<PAD>"]  # index 0 is reserved for padding
        for idx in range(1, len(self.index_word) + 1):
            if idx in self.index_word:
                vocab.append(self.index_word[idx])
        return vocab

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            The vocabulary size.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer is not fitted yet; call fit first")
        return len(self.word_index) + 1  # +1 because index 0 is reserved for padding

    def save(self, path: str) -> None:
        """
        Save the Word2Vec vectorizer.

        Args:
            path: Target path.
        """
        ensure_dir(os.path.dirname(path))
        # Save the model next to the state file
        model_path = os.path.join(os.path.dirname(path), "word2vec_model")
        if self.model:
            self.model.save(model_path)
        # Save configuration and state
        state = {
            'word_index': self.word_index,
            'index_word': self.index_word,
            'vector_size': self.vector_size,
            'window': self.window,
            'min_count': self.min_count,
            'workers': self.workers,
            'sg': self.sg,
            'max_sequence_length': self.max_sequence_length,
            'padding': self.padding,
            'truncating': self.truncating,
            'is_fitted': self.is_fitted,
            'model_path': model_path if self.model else None
        }
        save_pickle(state, path)
        logger.info(f"Word2Vec vectorizer saved to: {path}")

    def load(self, path: str) -> None:
        """
        Load the Word2Vec vectorizer.

        Args:
            path: Source path.
        """
        state = load_pickle(path)
        self.word_index = state['word_index']
        self.index_word = state['index_word']
        self.vector_size = state['vector_size']
        self.window = state['window']
        self.min_count = state['min_count']
        self.workers = state['workers']
        self.sg = state['sg']
        self.max_sequence_length = state['max_sequence_length']
        self.padding = state['padding']
        self.truncating = state['truncating']
        self.is_fitted = state['is_fitted']
        # Load the model; it may be a full Word2Vec model or bare KeyedVectors
        model_path = state.get('model_path')
        if model_path and os.path.exists(model_path):
            try:
                self.model = Word2Vec.load(model_path)
            except Exception:
                self.model = KeyedVectors.load(model_path)
        logger.info(f"Word2Vec vectorizer loaded from {path}, vocabulary size: {len(self.word_index)}")