2025-03-08 01:34:36 +08:00

230 lines
6.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Text cleaning module: cleans raw text by removing unwanted characters, HTML tags, and similar noise.
"""
import re
import unicodedata
import html
from typing import List, Dict, Tuple, Optional, Any, Callable, Set, Union
import string
from utils.logger import get_logger
# Module-level logger, obtained from the project's get_logger helper with the name "TextCleaner".
logger = get_logger("TextCleaner")
class TextCleaner:
    """Configurable cleaner for mixed Chinese/English text.

    Each cleaning step (HTML stripping, URL/e-mail removal, Unicode
    normalization, digit/punctuation removal, lowercasing, whitespace
    collapsing, short-text filtering) is toggled by a constructor flag and
    applied by :meth:`clean_text` in a fixed order.
    """

    def __init__(self, remove_html: bool = True,
                 remove_urls: bool = True,
                 remove_emails: bool = True,
                 remove_numbers: bool = False,
                 remove_punctuation: bool = False,
                 lowercase: bool = False,
                 normalize_unicode: bool = True,
                 remove_excessive_spaces: bool = True,
                 remove_short_texts: bool = False,
                 min_text_length: int = 10,
                 custom_patterns: Optional[List[str]] = None):
        """Store the configuration and pre-compile every regular expression.

        Args:
            remove_html: Decode HTML entities and strip HTML tags.
            remove_urls: Remove ``http(s)://`` and ``www.`` URLs.
            remove_emails: Remove e-mail addresses.
            remove_numbers: Remove runs of digits.
            remove_punctuation: Remove ASCII and Chinese punctuation.
            lowercase: Lowercase the text (no effect on Chinese characters).
            normalize_unicode: Apply NFKC Unicode normalization.
            remove_excessive_spaces: Collapse whitespace runs into one space
                and strip both ends.
            remove_short_texts: Return ``""`` for results shorter than
                ``min_text_length``.
            min_text_length: Minimum length used by ``remove_short_texts``
                and by :meth:`remove_redundant_texts`.
            custom_patterns: Extra regular expressions; every match is
                replaced by a single space.

        Raises:
            re.error: If one of ``custom_patterns`` is not a valid regex.
        """
        self.remove_html = remove_html
        self.remove_urls = remove_urls
        self.remove_emails = remove_emails
        self.remove_numbers = remove_numbers
        self.remove_punctuation = remove_punctuation
        self.lowercase = lowercase
        self.normalize_unicode = normalize_unicode
        self.remove_excessive_spaces = remove_excessive_spaces
        self.remove_short_texts = remove_short_texts
        self.min_text_length = min_text_length
        self.custom_patterns = custom_patterns or []

        # `[^>]*` (unlike `.*?`) also crosses newlines, so tags whose
        # attributes wrap onto several lines are stripped too.
        self.html_pattern = re.compile(r'<[^>]*>')
        # URL and e-mail bodies are limited to ASCII: Chinese text has no
        # spaces between words, so `\S+` would swallow the whole sentence
        # surrounding a link or an address.
        self.url_pattern = re.compile(r'(?:https?://|www\.)[\x21-\x7e]+')
        self.email_pattern = re.compile(
            r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+'
        )
        self.number_pattern = re.compile(r'\d+')
        self.space_pattern = re.compile(r'\s+')
        self.compiled_custom_patterns = [
            re.compile(pattern) for pattern in self.custom_patterns
        ]

        # Chinese punctuation, written with escapes so the fullwidth forms
        # cannot be confused with (or silently replaced by) ASCII look-alikes.
        self.chinese_punctuation = (
            "\uff0c\u3002\uff01\uff1f\uff1b\uff1a"  # comma, full stop, ! ? ; :
            "\u201c\u201d\u2018\u2019"              # curly double/single quotes
            "\u3010\u3011\u300a\u300b"              # lenticular / double-angle brackets
            "\uff08\uff09"                          # fullwidth parentheses
            "\u3001\u2026\u2014\uff5e\u00b7"        # enumeration comma, ellipsis, dash, tilde, middle dot
        )
        logger.info("文本清洗器初始化完成")

    def _punctuation_table(self) -> Dict[int, str]:
        """Build a ``str.translate`` table mapping each punctuation mark to a space."""
        return {
            ord(mark): ' '
            for mark in string.punctuation + self.chinese_punctuation
        }

    def clean_text(self, text: str) -> str:
        """Clean one text by applying every enabled step in order.

        Order: HTML, URLs, e-mails, Unicode normalization, digits,
        punctuation, custom patterns, lowercasing, whitespace, length filter.

        Args:
            text: Raw input text.

        Returns:
            The cleaned text, or ``""`` when the input is empty or the result
            is filtered out as too short.
        """
        if not text:
            return ""

        if self.remove_html:
            # Entities are decoded first, then tags are replaced by a space.
            text = html.unescape(text)
            text = self.html_pattern.sub(' ', text)

        if self.remove_urls:
            text = self.url_pattern.sub(' ', text)

        if self.remove_emails:
            text = self.email_pattern.sub(' ', text)

        if self.normalize_unicode:
            text = unicodedata.normalize('NFKC', text)

        if self.remove_numbers:
            text = self.number_pattern.sub(' ', text)

        if self.remove_punctuation:
            # One pass over the text instead of one replace() per mark.
            text = text.translate(self._punctuation_table())

        for pattern in self.compiled_custom_patterns:
            text = pattern.sub(' ', text)

        if self.lowercase:
            text = text.lower()

        if self.remove_excessive_spaces:
            text = self.space_pattern.sub(' ', text)
            text = text.strip()

        if self.remove_short_texts and len(text) < self.min_text_length:
            return ""

        return text

    def clean_texts(self, texts: List[str]) -> List[str]:
        """Clean every text of a list with :meth:`clean_text`.

        Args:
            texts: Raw input texts.

        Returns:
            Cleaned texts, in the same order and of the same length as the
            input (filtered entries become ``""``).
        """
        return [self.clean_text(text) for text in texts]

    def remove_redundant_texts(self, texts: List[str]) -> List[str]:
        """Drop empty texts and texts shorter than ``min_text_length``.

        Args:
            texts: Input texts.

        Returns:
            The texts that are non-empty and at least ``min_text_length``
            characters long, in their original order.
        """
        return [
            text for text in texts
            if text and len(text) >= self.min_text_length
        ]

    @staticmethod
    def remove_specific_characters(text: str, chars_to_remove: Union[str, Set[str]]) -> str:
        """Delete the given characters from the text.

        Args:
            text: Input text.
            chars_to_remove: Either a string whose individual characters are
                removed, or a set of strings, each of which is removed.

        Returns:
            The text without the requested characters.
        """
        if isinstance(chars_to_remove, str):
            # Every element is a single character: delete them in one pass.
            return text.translate({ord(char): None for char in chars_to_remove})
        # Set members may be longer than one character, so fall back to replace().
        for fragment in chars_to_remove:
            text = text.replace(fragment, '')
        return text

    @staticmethod
    def replace_characters(text: str, char_map: Dict[str, str]) -> str:
        """Replace characters according to a mapping.

        Replacements are applied one after another in the mapping's iteration
        order, so a later entry also sees the output of earlier ones.

        Args:
            text: Input text.
            char_map: Mapping from the text to replace to its replacement.

        Returns:
            The text after all replacements.
        """
        for old_char, new_char in char_map.items():
            text = text.replace(old_char, new_char)
        return text

    @staticmethod
    def remove_empty_lines(text: str) -> str:
        """Remove lines that are empty or contain only whitespace.

        Args:
            text: Input text.

        Returns:
            The remaining lines joined with ``"\\n"``.
        """
        return '\n'.join(line for line in text.splitlines() if line.strip())

    @staticmethod
    def truncate_text(text: str, max_length: int, truncate_from_end: bool = True) -> str:
        """Truncate the text to at most ``max_length`` characters.

        Args:
            text: Input text.
            max_length: Maximum number of characters to keep; zero or a
                negative value yields an empty string.
            truncate_from_end: When True, drop characters from the end (keep
                the head); when False, drop from the start (keep the tail).

        Returns:
            The text unchanged if it already fits, otherwise the kept part.
        """
        if max_length <= 0:
            # A negative limit would otherwise act as a negative slice index.
            return ""
        if len(text) <= max_length:
            return text
        if truncate_from_end:
            return text[:max_length]
        return text[-max_length:]