"""
|
||
文本向量化模块:实现文本向量化,包括词袋模型、TF-IDF和词嵌入等多种文本表示方法
|
||
"""
|
||
import os
from typing import Callable, List, Optional

import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors, Word2Vec
# Alias sklearn's TfidfVectorizer so it does not clash with the
# TfidfVectorizer class defined in this module.
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer as SklearnTfidfVectorizer,
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from config.system_config import PROCESSED_DATA_DIR, EMBEDDINGS_DIR
from config.model_config import (
    MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, MIN_WORD_FREQUENCY
)
from preprocessing.tokenization import ChineseTokenizer
from utils.logger import get_logger
from utils.file_utils import save_pickle, load_pickle, ensure_dir

logger = get_logger("Vectorizer")


class TextVectorizer:
    """Base class for text vectorizers; defines the common interface."""

    def __init__(self, max_features: Optional[int] = MAX_NUM_WORDS):
        """
        Initialize the text vectorizer.

        Args:
            max_features: Maximum number of features (vocabulary size)
        """
        self.max_features = max_features
        self.vectorizer = None
        self.is_fitted = False

    def fit(self, texts: List[str]) -> None:
        """
        Fit the vectorizer on the given texts.

        Args:
            texts: List of texts
        """
        raise NotImplementedError("Subclasses must implement this method")

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Transform texts into their vector representation.

        Args:
            texts: List of texts

        Returns:
            Vector representation
        """
        raise NotImplementedError("Subclasses must implement this method")

    def fit_transform(self, texts: List[str]) -> np.ndarray:
        """
        Fit the vectorizer on the given texts, then transform them.

        Args:
            texts: List of texts

        Returns:
            Vector representation
        """
        self.fit(texts)
        return self.transform(texts)

    def save(self, path: str) -> None:
        """
        Save the vectorizer.

        Args:
            path: Target path
        """
        ensure_dir(os.path.dirname(path))
        save_pickle(self.vectorizer, path)
        logger.info(f"Vectorizer saved to: {path}")

    def load(self, path: str) -> None:
        """
        Load the vectorizer.

        Args:
            path: Source path
        """
        self.vectorizer = load_pickle(path)
        self.is_fitted = True
        logger.info(f"Vectorizer loaded from {path}")

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            Vocabulary
        """
        raise NotImplementedError("Subclasses must implement this method")

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            Vocabulary size
        """
        raise NotImplementedError("Subclasses must implement this method")
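
# Illustrative sketch of the shared contract (comments only, not executed):
# every concrete vectorizer below is used the same way; `train_texts` and
# `test_texts` are placeholders for the caller's data, and the save path is
# hypothetical.
#
#     vectorizer = BagOfWordsVectorizer()
#     vectorizer.fit(train_texts)                 # learn the vocabulary
#     features = vectorizer.transform(test_texts)
#     vectorizer.save("models/vectorizer.pkl")

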
class BagOfWordsVectorizer(TextVectorizer):
    """Bag-of-words vectorizer"""

    def __init__(self, max_features: int = MAX_NUM_WORDS,
                 min_df: int = MIN_WORD_FREQUENCY,
                 tokenizer: Optional[Callable[[str], List[str]]] = None,
                 binary: bool = False):
        """
        Initialize the bag-of-words vectorizer.

        Args:
            max_features: Maximum number of features (vocabulary size)
            min_df: Minimum document frequency
            tokenizer: Tokenizer function that takes a text and returns a list of tokens
            binary: Whether to use binary counts (only token presence/absence,
                ignoring frequency)
        """
        super().__init__(max_features)
        self.min_df = min_df
        self.binary = binary

        # Create sklearn's CountVectorizer
        self.vectorizer = CountVectorizer(
            max_features=max_features,
            min_df=min_df,
            tokenizer=tokenizer,
            binary=binary
        )

    def fit(self, texts: List[str]) -> None:
        """
        Fit the bag-of-words model on the given texts.

        Args:
            texts: List of texts
        """
        self.vectorizer.fit(texts)
        self.is_fitted = True
        logger.info(f"Bag-of-words model fitted, vocabulary size: {len(self.vectorizer.vocabulary_)}")

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Transform texts into bag-of-words vectors.

        Args:
            texts: List of texts

        Returns:
            Bag-of-words representation (sparse matrix)
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        return self.vectorizer.transform(texts)

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            Vocabulary (ordered by index)
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        # CountVectorizer's vocabulary is a dict mapping token -> index
        vocab_dict = self.vectorizer.vocabulary_
        vocab_list = [""] * len(vocab_dict)
        for word, idx in vocab_dict.items():
            vocab_list[idx] = word

        return vocab_list

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            Vocabulary size
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        return len(self.vectorizer.vocabulary_)
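
# Usage sketch for the bag-of-words vectorizer (illustrative; assumes
# ChineseTokenizer exposes a `tokenize` method returning a list of tokens):
#
#     bow = BagOfWordsVectorizer(max_features=5000, binary=True,
#                                tokenizer=ChineseTokenizer().tokenize)
#     X = bow.fit_transform(["这是一个例子", "另一个例子"])  # sparse matrix
#     print(bow.get_vocabulary_size(), X.shape)

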
class TfidfVectorizer(TextVectorizer):
    """TF-IDF vectorizer"""

    def __init__(self, max_features: int = MAX_NUM_WORDS,
                 min_df: int = MIN_WORD_FREQUENCY,
                 tokenizer: Optional[Callable[[str], List[str]]] = None,
                 norm: str = 'l2',
                 use_idf: bool = True,
                 smooth_idf: bool = True,
                 sublinear_tf: bool = False):
        """
        Initialize the TF-IDF vectorizer.

        Args:
            max_features: Maximum number of features (vocabulary size)
            min_df: Minimum document frequency
            tokenizer: Tokenizer function that takes a text and returns a list of tokens
            norm: Normalization method; defaults to the L2 norm
            use_idf: Whether to use IDF (inverse document frequency)
            smooth_idf: Whether to smooth IDF weights
            sublinear_tf: Whether to apply sublinear scaling (logarithm of TF)
        """
        super().__init__(max_features)
        self.min_df = min_df

        # Create sklearn's TfidfVectorizer (imported under an alias so it is
        # not shadowed by this class)
        self.vectorizer = SklearnTfidfVectorizer(
            max_features=max_features,
            min_df=min_df,
            tokenizer=tokenizer,
            norm=norm,
            use_idf=use_idf,
            smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf
        )

    def fit(self, texts: List[str]) -> None:
        """
        Fit the TF-IDF model on the given texts.

        Args:
            texts: List of texts
        """
        self.vectorizer.fit(texts)
        self.is_fitted = True
        logger.info(f"TF-IDF model fitted, vocabulary size: {len(self.vectorizer.vocabulary_)}")

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Transform texts into TF-IDF vectors.

        Args:
            texts: List of texts

        Returns:
            TF-IDF representation (sparse matrix)
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        return self.vectorizer.transform(texts)

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            Vocabulary (ordered by index)
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        # The TF-IDF vocabulary is a dict mapping token -> index
        vocab_dict = self.vectorizer.vocabulary_
        vocab_list = [""] * len(vocab_dict)
        for word, idx in vocab_dict.items():
            vocab_list[idx] = word

        return vocab_list

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            Vocabulary size
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        return len(self.vectorizer.vocabulary_)

    def get_feature_names(self) -> List[str]:
        """
        Get the feature names (vocabulary).

        Returns:
            List of feature names
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        return list(self.vectorizer.get_feature_names_out())

    def get_idf(self) -> np.ndarray:
        """
        Get the IDF weights.

        Returns:
            Array of IDF weights
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        return self.vectorizer.idf_
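
# Usage sketch for the TF-IDF vectorizer (illustrative; `corpus_texts` is a
# placeholder): fit on a corpus, then pair feature names with IDF weights to
# inspect the rarest terms.
#
#     tfidf = TfidfVectorizer(max_features=5000, sublinear_tf=True)
#     X = tfidf.fit_transform(corpus_texts)        # sparse (n_docs, n_terms)
#     rarest = sorted(zip(tfidf.get_feature_names(), tfidf.get_idf()),
#                     key=lambda pair: -pair[1])[:10]

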
class SequenceVectorizer(TextVectorizer):
    """Sequence vectorizer built on Keras's Tokenizer"""

    def __init__(self, max_features: int = MAX_NUM_WORDS,
                 max_sequence_length: int = MAX_SEQUENCE_LENGTH,
                 oov_token: str = "<OOV>",
                 padding: str = "post",
                 truncating: str = "post"):
        """
        Initialize the sequence vectorizer.

        Args:
            max_features: Maximum number of features (vocabulary size)
            max_sequence_length: Maximum sequence length
            oov_token: Token used for out-of-vocabulary words
            padding: Padding mode, 'pre' or 'post'
            truncating: Truncation mode, 'pre' or 'post'
        """
        super().__init__(max_features)
        self.max_sequence_length = max_sequence_length
        self.oov_token = oov_token
        self.padding = padding
        self.truncating = truncating

        # Create Keras's Tokenizer
        self.vectorizer = Tokenizer(num_words=max_features, oov_token=oov_token)

    def fit(self, texts: List[str]) -> None:
        """
        Fit the sequence vectorizer on the given texts.

        Args:
            texts: List of texts
        """
        self.vectorizer.fit_on_texts(texts)
        self.is_fitted = True
        logger.info(f"Sequence vectorizer fitted, vocabulary size: {len(self.vectorizer.word_index)}")

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Transform texts into padded integer sequences.

        Args:
            texts: List of texts

        Returns:
            Integer-sequence representation
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        sequences = self.vectorizer.texts_to_sequences(texts)
        padded_sequences = pad_sequences(
            sequences,
            maxlen=self.max_sequence_length,
            padding=self.padding,
            truncating=self.truncating
        )

        return padded_sequences

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            Vocabulary (ordered by index)
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        # The Tokenizer's vocabulary is a dict mapping word -> index (starting
        # at 1). Index 0 is reserved for padding, and the OOV token (if set)
        # already occupies index 1 in word_index, so it must not be appended
        # separately or it would appear twice.
        word_index = self.vectorizer.word_index
        index_word = {index: word for word, index in word_index.items()}

        vocab = ["<PAD>"]
        max_index = min(self.max_features, len(word_index) + 1) if self.max_features else len(word_index) + 1
        for i in range(1, max_index):
            if i in index_word:
                vocab.append(index_word[i])

        return vocab

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            Vocabulary size
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        # +1 because index 0 is reserved for padding
        if self.max_features:
            return min(self.max_features, len(self.vectorizer.word_index) + 1)
        return len(self.vectorizer.word_index) + 1

    def texts_to_sequences(self, texts: List[str]) -> List[List[int]]:
        """
        Convert texts to integer sequences (without padding).

        Args:
            texts: List of texts

        Returns:
            List of integer sequences
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        return self.vectorizer.texts_to_sequences(texts)

    def sequences_to_padded(self, sequences: List[List[int]]) -> np.ndarray:
        """
        Pad integer sequences to the configured length.

        Args:
            sequences: List of integer sequences

        Returns:
            Padded integer sequences
        """
        return pad_sequences(
            sequences,
            maxlen=self.max_sequence_length,
            padding=self.padding,
            truncating=self.truncating
        )

    def save(self, path: str) -> None:
        """
        Save the sequence vectorizer.

        Args:
            path: Target path
        """
        ensure_dir(os.path.dirname(path))

        # Save configuration and state
        tokenizer_state = {
            'tokenizer': self.vectorizer,
            'max_features': self.max_features,
            'max_sequence_length': self.max_sequence_length,
            'oov_token': self.oov_token,
            'padding': self.padding,
            'truncating': self.truncating,
            'is_fitted': self.is_fitted
        }

        save_pickle(tokenizer_state, path)
        logger.info(f"Sequence vectorizer saved to: {path}")

    def load(self, path: str) -> None:
        """
        Load the sequence vectorizer.

        Args:
            path: Source path
        """
        tokenizer_state = load_pickle(path)

        self.vectorizer = tokenizer_state['tokenizer']
        self.max_features = tokenizer_state['max_features']
        self.max_sequence_length = tokenizer_state['max_sequence_length']
        self.oov_token = tokenizer_state['oov_token']
        self.padding = tokenizer_state['padding']
        self.truncating = tokenizer_state['truncating']
        self.is_fitted = tokenizer_state['is_fitted']

        logger.info(f"Sequence vectorizer loaded from {path}, vocabulary size: {len(self.vectorizer.word_index)}")
class Word2VecVectorizer(TextVectorizer):
    """Word2Vec embedding vectorizer"""

    def __init__(self, vector_size: int = 100,
                 window: int = 5,
                 min_count: int = MIN_WORD_FREQUENCY,
                 workers: int = 4,
                 sg: int = 1,  # 1 = Skip-gram, 0 = CBOW
                 max_sequence_length: int = MAX_SEQUENCE_LENGTH,
                 padding: str = "post",
                 truncating: str = "post",
                 pretrained_path: Optional[str] = None):
        """
        Initialize the Word2Vec embedding vectorizer.

        Args:
            vector_size: Dimensionality of the word vectors
            window: Context window size
            min_count: Minimum word frequency
            workers: Number of worker threads used for training
            sg: Training algorithm, 1 for Skip-gram, 0 for CBOW
            max_sequence_length: Maximum sequence length
            padding: Padding mode, 'pre' or 'post'
            truncating: Truncation mode, 'pre' or 'post'
            pretrained_path: Path to pretrained word vectors; if not None, they are loaded
        """
        super().__init__(max_features=None)  # Word2Vec has no max_features limit
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.sg = sg
        self.max_sequence_length = max_sequence_length
        self.padding = padding
        self.truncating = truncating
        self.pretrained_path = pretrained_path

        # Word2Vec model and its keyed vectors
        self.model = None
        self.wv = None

        # Vocabulary
        self.word_index = {}
        self.index_word = {}

        # Load pretrained vectors if a path was given
        if pretrained_path and os.path.exists(pretrained_path):
            self._load_pretrained(pretrained_path)

    def _load_pretrained(self, path: str) -> None:
        """
        Load pretrained word vectors.

        Args:
            path: Path to the pretrained vectors
        """
        try:
            # Try to load a full Word2Vec model
            self.model = Word2Vec.load(path)
            logger.info(f"Loaded pretrained Word2Vec model: {path}")
        except Exception:
            try:
                # Fall back to raw keyed vectors (word2vec, GloVe, or FastText format)
                self.model = KeyedVectors.load_word2vec_format(path, binary=path.endswith('.bin'))
                logger.info(f"Loaded pretrained word vectors: {path}")
            except Exception as e:
                logger.error(f"Failed to load pretrained word vectors: {e}")
                return

        # If loading succeeded, build the vocabulary
        self._build_vocab_from_model()
        self.is_fitted = True

    def _build_vocab_from_model(self) -> None:
        """Build the vocabulary from the loaded model."""
        # A full Word2Vec model exposes its vectors via .wv, while KeyedVectors
        # loaded directly are the vectors themselves
        self.wv = self.model.wv if hasattr(self.model, 'wv') else self.model

        vocabulary = list(self.wv.index_to_key)

        # Build the vocabulary index; index 0 is reserved for padding
        self.word_index = {word: idx + 1 for idx, word in enumerate(vocabulary)}
        self.index_word = {idx + 1: word for idx, word in enumerate(vocabulary)}
        self.index_word[0] = "<PAD>"

    def fit(self, tokenized_texts: List[List[str]]) -> None:
        """
        Train the Word2Vec model on tokenized texts.

        Args:
            tokenized_texts: List of tokenized texts (each text is a list of tokens)
        """
        # Skip training if a pretrained model is already loaded
        if self.is_fitted and self.model is not None:
            logger.info("Pretrained model already loaded; skipping training")
            return

        # Train the Word2Vec model
        self.model = Word2Vec(
            sentences=tokenized_texts,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            sg=self.sg
        )

        # Build the vocabulary
        self._build_vocab_from_model()
        self.is_fitted = True

        logger.info(f"Word2Vec model trained, vocabulary size: {len(self.word_index)}")

    def transform(self, tokenized_texts: List[List[str]]) -> np.ndarray:
        """
        Transform tokenized texts into sequences of word vectors.

        Args:
            tokenized_texts: List of tokenized texts (each text is a list of tokens)

        Returns:
            Word-vector sequences of shape (num_samples, max_sequence_length, vector_size)
        """
        if not self.is_fitted or self.model is None:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        # Initialize the result array
        result = np.zeros((len(tokenized_texts), self.max_sequence_length, self.vector_size))

        # Process each text
        for i, text in enumerate(tokenized_texts):
            seq_len = min(len(text), self.max_sequence_length)

            # Apply the configured truncation mode
            if self.truncating == 'pre' and len(text) > self.max_sequence_length:
                text = text[-self.max_sequence_length:]
            elif self.truncating == 'post' and len(text) > self.max_sequence_length:
                text = text[:self.max_sequence_length]

            # Look up the vector of each token
            for j, word in enumerate(text[:seq_len]):
                if word in self.wv:
                    # Position depends on the padding mode
                    pos = j if self.padding == 'post' else self.max_sequence_length - seq_len + j
                    result[i, pos] = self.wv[word]

        return result

    def transform_to_indices(self, tokenized_texts: List[List[str]]) -> np.ndarray:
        """
        Transform tokenized texts into padded index sequences.

        Args:
            tokenized_texts: List of tokenized texts (each text is a list of tokens)

        Returns:
            Padded index sequences
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        # Map tokens to indices; out-of-vocabulary tokens map to 0 (padding)
        sequences = []
        for text in tokenized_texts:
            seq = [self.word_index.get(word, 0) for word in text]
            sequences.append(seq)

        # Pad the sequences
        padded_sequences = pad_sequences(
            sequences,
            maxlen=self.max_sequence_length,
            padding=self.padding,
            truncating=self.truncating
        )

        return padded_sequences

    def get_embedding_matrix(self) -> np.ndarray:
        """
        Get the embedding matrix, e.g. to initialize an Embedding layer's weights.

        Returns:
            Embedding matrix of shape (vocabulary size, vector_size)
        """
        if not self.is_fitted or self.model is None:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        vocab_size = len(self.word_index) + 1  # +1 because index 0 is reserved for padding
        embedding_matrix = np.zeros((vocab_size, self.vector_size))

        # Fill the embedding matrix
        for word, idx in self.word_index.items():
            if word in self.wv:
                embedding_matrix[idx] = self.wv[word]

        return embedding_matrix

    def get_vocabulary(self) -> List[str]:
        """
        Get the vocabulary.

        Returns:
            Vocabulary (ordered by index)
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        vocab = ["<PAD>"]  # index 0 is reserved for padding
        for idx in range(1, len(self.index_word) + 1):
            if idx in self.index_word:
                vocab.append(self.index_word[idx])

        return vocab

    def get_vocabulary_size(self) -> int:
        """
        Get the vocabulary size.

        Returns:
            Vocabulary size
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer has not been fitted; call fit first")

        return len(self.word_index) + 1  # +1 because index 0 is reserved for padding

    def save(self, path: str) -> None:
        """
        Save the Word2Vec vectorizer.

        Args:
            path: Target path
        """
        ensure_dir(os.path.dirname(path))

        # Save the model itself next to the state file
        model_path = os.path.join(os.path.dirname(path), "word2vec_model")
        if self.model:
            self.model.save(model_path)

        # Save configuration and state
        state = {
            'word_index': self.word_index,
            'index_word': self.index_word,
            'vector_size': self.vector_size,
            'window': self.window,
            'min_count': self.min_count,
            'workers': self.workers,
            'sg': self.sg,
            'max_sequence_length': self.max_sequence_length,
            'padding': self.padding,
            'truncating': self.truncating,
            'is_fitted': self.is_fitted,
            'model_path': model_path if self.model else None
        }

        save_pickle(state, path)
        logger.info(f"Word2Vec vectorizer saved to: {path}")

    def load(self, path: str) -> None:
        """
        Load the Word2Vec vectorizer.

        Args:
            path: Source path
        """
        state = load_pickle(path)

        self.word_index = state['word_index']
        self.index_word = state['index_word']
        self.vector_size = state['vector_size']
        self.window = state['window']
        self.min_count = state['min_count']
        self.workers = state['workers']
        self.sg = state['sg']
        self.max_sequence_length = state['max_sequence_length']
        self.padding = state['padding']
        self.truncating = state['truncating']
        self.is_fitted = state['is_fitted']

        # Load the model and its keyed vectors
        model_path = state.get('model_path')
        if model_path and os.path.exists(model_path):
            self.model = Word2Vec.load(model_path)
            self.wv = self.model.wv if hasattr(self.model, 'wv') else self.model

        logger.info(f"Word2Vec vectorizer loaded from {path}, vocabulary size: {len(self.word_index)}")