"""Feature extraction module.

Implements text feature extractors (statistical, part-of-speech and
keyword-based features) plus a combinator that merges several extractors.
"""

import re
from collections import Counter
from typing import List, Dict, Tuple, Optional, Any, Union, Set

import jieba.posseg as pseg
import numpy as np

from config.system_config import CATEGORIES
from utils.logger import get_logger
from preprocessing.tokenization import ChineseTokenizer

logger = get_logger("FeatureExtraction")

# Patterns are compiled once at import time instead of on every call.
# Sentence terminators: ideographic full stop, ASCII "!" / "?", and their
# full-width forms (U+FF01, U+FF1F) which are what Chinese text normally uses.
_SENTENCE_SPLIT_RE = re.compile(r'[。!?\uFF01\uFF1F]+')
_DIGIT_RE = re.compile(r'\d')
_ENGLISH_RE = re.compile(r'[a-zA-Z]')
# Anything that is neither a word character nor whitespace counts as punctuation.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


class FeatureExtractor:
    """Base class that defines the common feature-extractor interface."""

    def __init__(self):
        """Initialise the extractor (the base class holds no state)."""
        pass

    def extract(self, text: str) -> Dict[str, Any]:
        """Extract features from a single text.

        Args:
            text: Input text.

        Returns:
            A dictionary of features.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError("子类必须实现此方法")

    def batch_extract(self, texts: List[str]) -> List[Dict[str, Any]]:
        """Extract features from every text in ``texts``.

        Args:
            texts: List of input texts.

        Returns:
            One feature dictionary per input text, in input order.
        """
        return [self.extract(text) for text in texts]

    def extract_as_vector(self, text: str) -> np.ndarray:
        """Extract features from a single text as a numeric vector.

        Args:
            text: Input text.

        Returns:
            The feature vector.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError("子类必须实现此方法")

    def batch_extract_as_vector(self, texts: List[str]) -> np.ndarray:
        """Extract feature vectors for every text in ``texts``.

        Args:
            texts: List of input texts.

        Returns:
            ``np.array`` built from the per-text vectors, in input order.
        """
        return np.array([self.extract_as_vector(text) for text in texts])


class StatisticalFeatureExtractor(FeatureExtractor):
    """Extractor for simple statistical features of a text."""

    def __init__(self, tokenizer: Optional[ChineseTokenizer] = None):
        """Initialise the statistical extractor.

        Args:
            tokenizer: Tokenizer to use; a new ``ChineseTokenizer`` is created
                when this is ``None``.
        """
        super().__init__()
        # Explicit None check so a valid-but-falsy tokenizer object is kept.
        self.tokenizer = tokenizer if tokenizer is not None else ChineseTokenizer()

    def extract(self, text: str) -> Dict[str, Any]:
        """Compute statistical features of ``text``.

        Args:
            text: Input text.

        Returns:
            A dictionary with the keys ``char_count``, ``word_count``,
            ``sentence_count``, ``avg_word_length``, ``avg_sentence_length``
            (in characters), ``contains_number``, ``contains_english``,
            ``punctuation_ratio`` and ``top_words`` (the five most common
            tokens as ``(token, count)`` pairs). Empty input yields zeros,
            ``False`` flags and an empty ``top_words`` list.
        """
        if not text:
            return {
                "char_count": 0,
                "word_count": 0,
                "sentence_count": 0,
                "avg_word_length": 0,
                "avg_sentence_length": 0,
                "contains_number": False,
                "contains_english": False,
                "punctuation_ratio": 0,
                "top_words": []
            }

        # text is non-empty from here on, so char_count is always > 0.
        char_count = len(text)

        # NOTE(review): assumes tokenize(..., return_string=False) returns a
        # sized collection of token strings -- confirm against ChineseTokenizer.
        words = self.tokenizer.tokenize(text, return_string=False)
        word_count = len(words)

        # Split on sentence-ending punctuation and drop blank fragments.
        sentences = [s for s in _SENTENCE_SPLIT_RE.split(text) if s.strip()]
        sentence_count = len(sentences)

        avg_word_length = (
            sum(len(word) for word in words) / word_count if word_count > 0 else 0
        )
        # A text made only of terminators has no sentences; avoid dividing by zero.
        avg_sentence_length = (
            char_count / sentence_count if sentence_count > 0 else 0
        )

        contains_number = bool(_DIGIT_RE.search(text))
        contains_english = bool(_ENGLISH_RE.search(text))

        punctuation_ratio = len(_PUNCTUATION_RE.findall(text)) / char_count

        top_words = Counter(words).most_common(5)

        return {
            "char_count": char_count,
            "word_count": word_count,
            "sentence_count": sentence_count,
            "avg_word_length": avg_word_length,
            "avg_sentence_length": avg_sentence_length,
            "contains_number": contains_number,
            "contains_english": contains_english,
            "punctuation_ratio": punctuation_ratio,
            "top_words": top_words
        }

    def extract_as_vector(self, text: str) -> np.ndarray:
        """Return the numeric statistical features of ``text`` as a vector.

        Args:
            text: Input text.

        Returns:
            A float32 vector of length 8: char count, word count, sentence
            count, average word length, average sentence length, the two
            boolean flags as 0/1, and the punctuation ratio. ``top_words`` is
            not numeric and is left out.
        """
        features = self.extract(text)

        vector = [
            features['char_count'],
            features['word_count'],
            features['sentence_count'],
            features['avg_word_length'],
            features['avg_sentence_length'],
            int(features['contains_number']),
            int(features['contains_english']),
            features['punctuation_ratio']
        ]

        return np.array(vector, dtype=np.float32)


class POSFeatureExtractor(FeatureExtractor):
    """Extractor for part-of-speech (POS) distribution features."""

    def __init__(self):
        """Initialise the POS extractor and its fixed tag inventory."""
        super().__init__()
        # Common Chinese POS tags and their descriptions. The key order also
        # fixes the layout of the vector built by extract_as_vector.
        self.pos_tags = {
            'n': '名词',
            'f': '方位名词',
            's': '处所名词',
            't': '时间名词',
            'nr': '人名',
            'ns': '地名',
            'nt': '机构团体',
            'nw': '作品名',
            'nz': '其他专名',
            'v': '动词',
            'vd': '副动词',
            'vn': '名动词',
            'a': '形容词',
            'ad': '副形词',
            'an': '名形词',
            'd': '副词',
            'm': '数词',
            'q': '量词',
            'r': '代词',
            'p': '介词',
            'c': '连词',
            'u': '助词',
            'xc': '其他虚词',
            'w': '标点符号'
        }

    def extract(self, text: str) -> Dict[str, Any]:
        """Count POS tags in ``text`` and compute their relative frequencies.

        Args:
            text: Input text.

        Returns:
            A dictionary with ``pos_counts`` (tag -> occurrences) and
            ``pos_ratios`` (tag -> share of all tagged tokens). Both are empty
            for empty input.
        """
        if not text:
            return {
                "pos_counts": {},
                "pos_ratios": {}
            }

        # jieba yields (word, tag) pairs; only the tag is needed here.
        tag_counter = Counter(pos for _word, pos in pseg.cut(text))
        total_count = sum(tag_counter.values())

        # Plain dict keeps the return type the same as before.
        pos_counts = dict(tag_counter)
        pos_ratios = (
            {pos: count / total_count for pos, count in pos_counts.items()}
            if total_count > 0 else {}
        )

        return {
            "pos_counts": pos_counts,
            "pos_ratios": pos_ratios
        }

    def extract_as_vector(self, text: str) -> np.ndarray:
        """Return POS ratios of ``text`` as a fixed-length vector.

        Args:
            text: Input text.

        Returns:
            A float32 vector with one entry per key of ``self.pos_tags`` (in
            that order); tags that do not occur map to 0.0. Tags produced by
            the tagger but absent from ``self.pos_tags`` are not represented.
        """
        pos_ratios = self.extract(text)['pos_ratios']
        vector = [pos_ratios.get(pos, 0.0) for pos in self.pos_tags]
        return np.array(vector, dtype=np.float32)


class KeywordFeatureExtractor(FeatureExtractor):
    """Extractor that scores a text against predefined per-category keywords."""

    def __init__(self,
                 category_keywords: Optional[Dict[str, List[str]]] = None,
                 tokenizer: Optional[ChineseTokenizer] = None):
        """Initialise the keyword extractor.

        Args:
            category_keywords: Mapping of category name to keyword list. When
                ``None`` or empty, the built-in defaults are used. The mapping
                is copied, so later ``update_keywords`` / ``add_keywords``
                calls never modify the caller's dictionary or lists.
            tokenizer: Tokenizer to use; a new ``ChineseTokenizer`` is created
                when this is ``None``.
        """
        super().__init__()
        source = category_keywords or self._get_default_keywords()
        self.category_keywords = {
            category: list(keywords) for category, keywords in source.items()
        }
        self.tokenizer = tokenizer if tokenizer is not None else ChineseTokenizer()

    def _get_default_keywords(self) -> Dict[str, List[str]]:
        """Build the default category -> keywords mapping.

        Returns:
            The built-in keyword lists, extended so that every entry of
            ``CATEGORIES`` has at least its own name as a keyword.
        """
        default_keywords = {
            "体育": ["比赛", "运动", "球员", "冠军", "球队", "足球", "篮球"],
            "财经": ["股票", "基金", "投资", "市场", "经济", "金融", "股市"],
            "房产": ["房价", "楼市", "地产", "购房", "房贷", "物业", "小区"],
            "家居": ["装修", "家具", "设计", "卧室", "客厅", "厨房", "风格"],
            "教育": ["学校", "学生", "考试", "教育", "大学", "课程", "老师"],
            "科技": ["互联网", "科技", "创新", "数字", "智能", "研发", "技术"],
            "时尚": ["时尚", "潮流", "服装", "搭配", "品牌", "美容", "穿着"],
            "时政": ["政府", "政策", "国家", "发展", "会议", "主席", "总理"],
            "游戏": ["游戏", "玩家", "电竞", "网游", "手游", "角色", "任务"],
            "娱乐": ["明星", "电影", "节目", "综艺", "电视", "演员", "导演"],
            "其他": ["其他", "一般", "常见", "普通", "正常", "通常", "传统"]
        }

        # Make sure every configured category has at least one keyword.
        for category in CATEGORIES:
            if category not in default_keywords:
                default_keywords[category] = [category]

        return default_keywords

    def extract(self, text: str) -> Dict[str, Any]:
        """Count keyword hits per category in ``text``.

        Args:
            text: Input text.

        Returns:
            A dictionary with ``keyword_matches`` (category -> number of that
            category's keywords found among the text's tokens) and
            ``keyword_match_ratios`` (category -> its share of all hits, or
            0.0 when nothing matched).
        """
        if not text:
            return {
                "keyword_matches": {cat: 0 for cat in self.category_keywords},
                "keyword_match_ratios": {cat: 0.0 for cat in self.category_keywords}
            }

        # A set gives O(1) membership tests; matching is on whole tokens.
        words = set(self.tokenizer.tokenize(text, return_string=False))

        keyword_matches = {
            category: sum(1 for kw in keywords if kw in words)
            for category, keywords in self.category_keywords.items()
        }

        # Normalise by the total number of hits across all categories.
        total_matches = sum(keyword_matches.values())
        keyword_match_ratios = {
            cat: matches / total_matches if total_matches > 0 else 0.0
            for cat, matches in keyword_matches.items()
        }

        return {
            "keyword_matches": keyword_matches,
            "keyword_match_ratios": keyword_match_ratios
        }

    def extract_as_vector(self, text: str) -> np.ndarray:
        """Return keyword match ratios of ``text`` as a vector.

        Args:
            text: Input text.

        Returns:
            A float32 vector with one ratio per entry of ``CATEGORIES`` (in
            that order); categories without keywords map to 0.0. Categories
            present only in ``self.category_keywords`` are not represented.
        """
        match_ratios = self.extract(text)['keyword_match_ratios']
        vector = [match_ratios.get(cat, 0.0) for cat in CATEGORIES]
        return np.array(vector, dtype=np.float32)

    def update_keywords(self, category: str, keywords: List[str]) -> None:
        """Replace the keyword list of ``category``.

        Args:
            category: Category name (created if it does not exist yet).
            keywords: New keyword list; it is copied so that later changes to
                the caller's list do not leak into the extractor.
        """
        self.category_keywords[category] = list(keywords)
        logger.info(f"已更新类别 {category} 的关键词,共 {len(keywords)} 个")

    def add_keywords(self, category: str, keywords: List[str]) -> None:
        """Add keywords to ``category`` without introducing duplicates.

        Args:
            category: Category name (created if it does not exist yet).
            keywords: Keywords to add.
        """
        existing = self.category_keywords.get(category, [])
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # resulting list is deterministic (a set round-trip is not).
        self.category_keywords[category] = list(
            dict.fromkeys([*existing, *keywords])
        )
        logger.info(f"已向类别 {category} 添加关键词,当前共 {len(self.category_keywords[category])} 个")


class CombinedFeatureExtractor(FeatureExtractor):
    """Extractor that combines the results of several other extractors."""

    def __init__(self, extractors: List[FeatureExtractor]):
        """Initialise the combined extractor.

        Args:
            extractors: Extractors whose results are combined, in order.
        """
        super().__init__()
        self.extractors = extractors

    def extract(self, text: str) -> Dict[str, Any]:
        """Run every extractor on ``text`` and collect the results.

        Args:
            text: Input text.

        Returns:
            A dictionary mapping each extractor's class name to its feature
            dictionary. If the same class appears more than once, the first
            instance keeps the plain class name and each later instance is
            stored under ``"<ClassName>_<position>"`` so no result is
            silently overwritten.
        """
        combined_features = {}

        for index, extractor in enumerate(self.extractors):
            extractor_name = type(extractor).__name__
            if extractor_name in combined_features:
                extractor_name = f"{extractor_name}_{index}"
            combined_features[extractor_name] = extractor.extract(text)

        return combined_features

    def extract_as_vector(self, text: str) -> np.ndarray:
        """Concatenate the vectors of all extractors for ``text``.

        Args:
            text: Input text.

        Returns:
            The per-extractor vectors joined end to end in extractor order,
            or an empty float32 vector when there are no extractors.
        """
        feature_vectors = [
            extractor.extract_as_vector(text) for extractor in self.extractors
        ]

        # np.concatenate raises ValueError on an empty sequence.
        if not feature_vectors:
            return np.array([], dtype=np.float32)

        return np.concatenate(feature_vectors)