""" 文本清洗模块:实现文本清洗,去除无用字符、HTML标签等 """ import re import unicodedata import html from typing import List, Dict, Tuple, Optional, Any, Callable, Set, Union import string from utils.logger import get_logger logger = get_logger("TextCleaner") class TextCleaner: """文本清洗类,提供各种文本清洗方法""" def __init__(self, remove_html: bool = True, remove_urls: bool = True, remove_emails: bool = True, remove_numbers: bool = False, remove_punctuation: bool = False, lowercase: bool = False, normalize_unicode: bool = True, remove_excessive_spaces: bool = True, remove_short_texts: bool = False, min_text_length: int = 10, custom_patterns: Optional[List[str]] = None): """ 初始化文本清洗器 Args: remove_html: 是否移除HTML标签 remove_urls: 是否移除URL remove_emails: 是否移除电子邮件地址 remove_numbers: 是否移除数字 remove_punctuation: 是否移除标点符号 lowercase: 是否转为小写(对中文无效) normalize_unicode: 是否规范化Unicode字符 remove_excessive_spaces: 是否移除多余空格 remove_short_texts: 是否过滤掉短文本 min_text_length: 最小文本长度(当remove_short_texts=True时有效) custom_patterns: 自定义的正则表达式模式列表,用于额外的文本清洗 """ self.remove_html = remove_html self.remove_urls = remove_urls self.remove_emails = remove_emails self.remove_numbers = remove_numbers self.remove_punctuation = remove_punctuation self.lowercase = lowercase self.normalize_unicode = normalize_unicode self.remove_excessive_spaces = remove_excessive_spaces self.remove_short_texts = remove_short_texts self.min_text_length = min_text_length self.custom_patterns = custom_patterns or [] # 编译正则表达式 self.html_pattern = re.compile(r'<.*?>') self.url_pattern = re.compile(r'https?://\S+|www\.\S+') self.email_pattern = re.compile(r'\S+@\S+\.\S+') self.number_pattern = re.compile(r'\d+') self.space_pattern = re.compile(r'\s+') # 编译自定义模式 self.compiled_custom_patterns = [re.compile(pattern) for pattern in self.custom_patterns] # 中文标点符号 self.chinese_punctuation = ",。!?;:""''【】《》()、…—~·" logger.info("文本清洗器初始化完成") def clean_text(self, text: str) -> str: """ 清洗文本,应用所有已配置的清洗方法 Args: text: 原始文本 Returns: 清洗后的文本 """ if not text: return "" # HTML解码 if self.remove_html: text = html.unescape(text) text = self.html_pattern.sub(' ', text) # 移除URL if self.remove_urls: text = self.url_pattern.sub(' ', text) # 移除电子邮件 if self.remove_emails: text = self.email_pattern.sub(' ', text) # Unicode规范化 if self.normalize_unicode: text = unicodedata.normalize('NFKC', text) # 移除数字 if self.remove_numbers: text = self.number_pattern.sub(' ', text) # 移除标点符号 if self.remove_punctuation: # 处理英文标点 for punct in string.punctuation: text = text.replace(punct, ' ') # 处理中文标点 for punct in self.chinese_punctuation: text = text.replace(punct, ' ') # 应用自定义清洗模式 for pattern in self.compiled_custom_patterns: text = pattern.sub(' ', text) # 转为小写 if self.lowercase: text = text.lower() # 移除多余空格 if self.remove_excessive_spaces: text = self.space_pattern.sub(' ', text) text = text.strip() # 过滤掉短文本 if self.remove_short_texts and len(text) < self.min_text_length: return "" return text def clean_texts(self, texts: List[str]) -> List[str]: """ 批量清洗文本 Args: texts: 原始文本列表 Returns: 清洗后的文本列表 """ return [self.clean_text(text) for text in texts] def remove_redundant_texts(self, texts: List[str]) -> List[str]: """ 移除冗余文本(空文本和长度小于阈值的文本) Args: texts: 原始文本列表 Returns: 移除冗余后的文本列表 """ return [text for text in texts if text and len(text) >= self.min_text_length] @staticmethod def remove_specific_characters(text: str, chars_to_remove: Union[str, Set[str]]) -> str: """ 移除特定字符 Args: text: 原始文本 chars_to_remove: 要移除的字符(字符串或字符集合) Returns: 移除特定字符后的文本 """ if isinstance(chars_to_remove, str): for char in chars_to_remove: text = text.replace(char, '') else: for char in chars_to_remove: text = text.replace(char, '') return text @staticmethod def replace_characters(text: str, char_map: Dict[str, str]) -> str: """ 替换特定字符 Args: text: 原始文本 char_map: 字符映射字典,键为要替换的字符,值为替换后的字符 Returns: 替换特定字符后的文本 """ for old_char, new_char in char_map.items(): text = text.replace(old_char, new_char) return text @staticmethod def remove_empty_lines(text: str) -> str: """ 移除空行 Args: text: 原始文本 Returns: 移除空行后的文本 """ lines = text.splitlines() non_empty_lines = [line for line in lines if line.strip()] return '\n'.join(non_empty_lines) @staticmethod def truncate_text(text: str, max_length: int, truncate_from_end: bool = True) -> str: """ 截断文本 Args: text: 原始文本 max_length: 最大长度 truncate_from_end: 是否从末尾截断,如果为False则从开头截断 Returns: 截断后的文本 """ if len(text) <= max_length: return text if truncate_from_end: return text[:max_length] else: return text[len(text) - max_length:]