"""
文本清洗模块:实现文本清洗,去除无用字符、HTML标签等
"""
import re
import unicodedata
import html
from typing import List, Dict, Tuple, Optional, Any, Callable, Set, Union
import string
from utils.logger import get_logger
# Module-level logger, registered under the name "TextCleaner"; TextCleaner.__init__
# writes its initialization message through it.
logger = get_logger("TextCleaner")
class TextCleaner:
    """Configurable text cleaner.

    An instance bundles a set of on/off cleaning steps (HTML stripping, URL and
    e-mail removal, Unicode normalization, ...) that ``clean_text`` applies in a
    fixed order. A few stateless helpers are exposed as static methods.
    """

    # One character that may belong to a URL / e-mail token: anything that is
    # neither whitespace nor a CJK / Hangul / full-width character. CJK text is
    # written without spaces, so a plain ``\S+`` would swallow the sentence
    # surrounding an address. For text without such characters this class is
    # equivalent to ``\S``.
    _TOKEN_CHAR = r'[^\s\u2e80-\u9fff\uac00-\ud7af\uf900-\ufaff\uff00-\uffef]'

    # Markup only: comments, declarations (<!DOCTYPE ...>, <![CDATA[...), processing
    # instructions and real tags (optional "/" followed by a letter). A bare
    # "<" such as in "1 < 2" is left alone. ``[^>]*`` and DOTALL let tags and
    # comments span several lines.
    html_pattern = re.compile(
        r'<!--.*?-->|<![^>]*>|<\?[^>]*>|</?[A-Za-z][^>]*>', re.DOTALL
    )
    url_pattern = re.compile(r'(?:https?://|www\.)' + _TOKEN_CHAR + '+')
    email_pattern = re.compile(
        _TOKEN_CHAR + '+@' + _TOKEN_CHAR + r'+\.' + _TOKEN_CHAR + '+'
    )
    number_pattern = re.compile(r'\d+')
    space_pattern = re.compile(r'\s+')

    # Chinese punctuation removed in addition to ``string.punctuation``. The
    # curly quotes are written as escapes so they cannot be confused with the
    # ASCII quote characters that delimit the literal.
    chinese_punctuation = ",。!?;:\u201c\u201d\u2018\u2019【】《》()、…—~·"

    def __init__(self, remove_html: bool = True,
                 remove_urls: bool = True,
                 remove_emails: bool = True,
                 remove_numbers: bool = False,
                 remove_punctuation: bool = False,
                 lowercase: bool = False,
                 normalize_unicode: bool = True,
                 remove_excessive_spaces: bool = True,
                 remove_short_texts: bool = False,
                 min_text_length: int = 10,
                 custom_patterns: Optional[List[str]] = None):
        """Store the cleaning configuration and compile the custom patterns.

        Args:
            remove_html: Decode HTML entities and strip HTML tags.
            remove_urls: Remove URLs (``http://``, ``https://``, ``www.``).
            remove_emails: Remove e-mail addresses.
            remove_numbers: Remove runs of digits.
            remove_punctuation: Remove ASCII and Chinese punctuation.
            lowercase: Convert the text to lower case.
            normalize_unicode: Apply NFKC Unicode normalization.
            remove_excessive_spaces: Collapse whitespace runs into one space
                and strip leading/trailing whitespace.
            remove_short_texts: Make ``clean_text`` return ``""`` for results
                shorter than ``min_text_length``.
            min_text_length: Length threshold used by ``remove_short_texts``
                and by ``remove_redundant_texts``.
            custom_patterns: Extra regular expressions; every match is replaced
                by a single space.
        """
        self.remove_html = remove_html
        self.remove_urls = remove_urls
        self.remove_emails = remove_emails
        self.remove_numbers = remove_numbers
        self.remove_punctuation = remove_punctuation
        self.lowercase = lowercase
        self.normalize_unicode = normalize_unicode
        self.remove_excessive_spaces = remove_excessive_spaces
        self.remove_short_texts = remove_short_texts
        self.min_text_length = min_text_length
        # Copy so that later changes to the caller's list do not affect us.
        self.custom_patterns = list(custom_patterns or [])
        self.compiled_custom_patterns = [
            re.compile(pattern) for pattern in self.custom_patterns
        ]
        logger.info("文本清洗器初始化完成")

    def clean_text(self, text: str) -> str:
        """Apply every enabled cleaning step to ``text``.

        Steps run in this order: HTML, URLs, e-mails, Unicode normalization,
        digits, punctuation, custom patterns, lower-casing, whitespace
        clean-up, short-text filter. Removed spans are replaced by a space so
        that neighbouring words do not get glued together.

        Args:
            text: Raw input text.

        Returns:
            The cleaned text, or ``""`` when the input is empty or the result
            is filtered out as too short.
        """
        if not text:
            return ""

        if self.remove_html:
            # Entities are decoded before tags are stripped, so entity-encoded
            # markup ("&lt;b&gt;") is removed as well.
            text = html.unescape(text)
            text = self.html_pattern.sub(' ', text)

        if self.remove_urls:
            text = self.url_pattern.sub(' ', text)

        if self.remove_emails:
            text = self.email_pattern.sub(' ', text)

        if self.normalize_unicode:
            text = unicodedata.normalize('NFKC', text)

        if self.remove_numbers:
            text = self.number_pattern.sub(' ', text)

        if self.remove_punctuation:
            # Single pass over the text instead of one str.replace per
            # punctuation character. The table is built per call so that a
            # changed ``chinese_punctuation`` attribute is honoured.
            table = dict.fromkeys(
                map(ord, string.punctuation + self.chinese_punctuation), ' '
            )
            text = text.translate(table)

        for pattern in self.compiled_custom_patterns:
            text = pattern.sub(' ', text)

        if self.lowercase:
            text = text.lower()

        if self.remove_excessive_spaces:
            text = self.space_pattern.sub(' ', text).strip()

        if self.remove_short_texts and len(text) < self.min_text_length:
            return ""
        return text

    def clean_texts(self, texts: List[str]) -> List[str]:
        """Clean every text in ``texts`` with ``clean_text``.

        Args:
            texts: Raw input texts.

        Returns:
            A list of the same length holding the cleaned texts (entries may
            be ``""``).
        """
        return [self.clean_text(text) for text in texts]

    def remove_redundant_texts(self, texts: List[str]) -> List[str]:
        """Drop empty texts and texts shorter than ``min_text_length``.

        The threshold is applied regardless of the ``remove_short_texts`` flag.

        Args:
            texts: Input texts.

        Returns:
            The texts that are non-empty and at least ``min_text_length`` long,
            in their original order.
        """
        return [text for text in texts if text and len(text) >= self.min_text_length]

    @staticmethod
    def remove_specific_characters(text: str, chars_to_remove: Union[str, Set[str]]) -> str:
        """Delete every occurrence of the given characters from ``text``.

        Args:
            text: Input text.
            chars_to_remove: Characters to delete, either as a string (each
                character is removed) or as a set of strings.

        Returns:
            ``text`` without the given characters.
        """
        # Iterating a str yields its characters and iterating a set yields its
        # members, so one loop serves both accepted input types.
        for char in chars_to_remove:
            text = text.replace(char, '')
        return text

    @staticmethod
    def replace_characters(text: str, char_map: Dict[str, str]) -> str:
        """Replace substrings of ``text`` according to ``char_map``.

        Replacements are applied one after another in the mapping's iteration
        order, so a later entry also sees the output of earlier ones.

        Args:
            text: Input text.
            char_map: Maps each string to replace to its replacement.

        Returns:
            The text after all replacements.
        """
        for old_char, new_char in char_map.items():
            text = text.replace(old_char, new_char)
        return text

    @staticmethod
    def remove_empty_lines(text: str) -> str:
        """Remove lines that are empty or contain only whitespace.

        Args:
            text: Input text.

        Returns:
            The remaining lines joined with ``"\\n"`` (original line endings
            and any trailing newline are not preserved).
        """
        return '\n'.join(line for line in text.splitlines() if line.strip())

    @staticmethod
    def truncate_text(text: str, max_length: int, truncate_from_end: bool = True) -> str:
        """Shorten ``text`` to at most ``max_length`` characters.

        Args:
            text: Input text.
            max_length: Maximum number of characters to keep. Zero or a
                negative value yields ``""``.
            truncate_from_end: If True, cut off the end (keep the first
                ``max_length`` characters); if False, cut off the beginning
                (keep the last ``max_length`` characters).

        Returns:
            The truncated text, or ``text`` unchanged when it is already short
            enough.
        """
        if max_length <= 0:
            # A negative length would otherwise be interpreted as a negative
            # slice index and return most of the text.
            return ""
        if len(text) <= max_length:
            return text
        if truncate_from_end:
            return text[:max_length]
        return text[len(text) - max_length:]