230 lines
6.9 KiB
Python
230 lines
6.9 KiB
Python
"""
|
||
文本清洗模块:实现文本清洗,去除无用字符、HTML标签等
|
||
"""
|
||
import re
|
||
import unicodedata
|
||
import html
|
||
from typing import List, Dict, Tuple, Optional, Any, Callable, Set, Union
|
||
import string
|
||
|
||
from utils.logger import get_logger
|
||
|
||
logger = get_logger("TextCleaner")
|
||
|
||
|
||
class TextCleaner:
    """Configurable text cleaner offering a pipeline and standalone helpers.

    The built-in patterns below are class attributes so they are compiled
    once and shared by every instance; assigning an attribute of the same
    name on an instance overrides it for that instance only.
    """

    # ``[^>]*`` (instead of ``.*?``) lets a tag span several lines.
    html_pattern = re.compile(r'<[^>]*>')
    # URL / e-mail bodies are restricted to ASCII URL characters. With ``\S+``
    # a match would swallow adjacent CJK text, which is not separated from
    # the URL or address by whitespace.
    url_pattern = re.compile(
        r"(?:https?://|www\.)[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+"
    )
    email_pattern = re.compile(
        r'[A-Za-z0-9._%+\-]+@[A-Za-z0-9\-]+(?:\.[A-Za-z0-9\-]+)+'
    )
    number_pattern = re.compile(r'\d+')
    space_pattern = re.compile(r'\s+')

    # Chinese / full-width punctuation. The curly quotes are written as
    # escapes: spelled with plain ASCII quotes the literal silently splits
    # into two adjacent strings and the curly quotes are lost.
    chinese_punctuation = ",。!?;:\u201c\u201d\u2018\u2019【】《》()、…—~·"

    def __init__(self, remove_html: bool = True,
                 remove_urls: bool = True,
                 remove_emails: bool = True,
                 remove_numbers: bool = False,
                 remove_punctuation: bool = False,
                 lowercase: bool = False,
                 normalize_unicode: bool = True,
                 remove_excessive_spaces: bool = True,
                 remove_short_texts: bool = False,
                 min_text_length: int = 10,
                 custom_patterns: Optional[List[str]] = None):
        """Initialise the cleaner.

        Args:
            remove_html: Decode HTML entities and strip HTML tags.
            remove_urls: Remove URLs.
            remove_emails: Remove e-mail addresses.
            remove_numbers: Remove runs of digits.
            remove_punctuation: Remove ASCII and Chinese punctuation.
            lowercase: Convert to lower case (no effect on Chinese text).
            normalize_unicode: Apply NFKC Unicode normalisation.
            remove_excessive_spaces: Collapse whitespace runs into a single
                space and strip both ends.
            remove_short_texts: Return "" for results shorter than
                ``min_text_length``.
            min_text_length: Minimum length, used by ``remove_short_texts``
                and by ``remove_redundant_texts``.
            custom_patterns: Extra regular expressions; every match is
                replaced with a space.

        Raises:
            re.error: If one of ``custom_patterns`` is not a valid regex.
        """
        self.remove_html = remove_html
        self.remove_urls = remove_urls
        self.remove_emails = remove_emails
        self.remove_numbers = remove_numbers
        self.remove_punctuation = remove_punctuation
        self.lowercase = lowercase
        self.normalize_unicode = normalize_unicode
        self.remove_excessive_spaces = remove_excessive_spaces
        self.remove_short_texts = remove_short_texts
        self.min_text_length = min_text_length
        # Copy so later mutation of the caller's list cannot get out of sync
        # with the compiled patterns.
        self.custom_patterns = list(custom_patterns) if custom_patterns else []

        self.compiled_custom_patterns = [
            re.compile(pattern) for pattern in self.custom_patterns
        ]

        logger.info("文本清洗器初始化完成")

    def clean_text(self, text: str) -> str:
        """Clean one text by applying every enabled step in a fixed order.

        Order: HTML -> URLs -> e-mails -> NFKC -> digits -> punctuation ->
        custom patterns -> lower-casing -> whitespace -> short-text filter.
        Removed spans are replaced by a space so neighbouring words do not
        merge.

        Args:
            text: Raw text.

        Returns:
            The cleaned text; "" when the input is empty/falsy or when the
            result is filtered out as too short.
        """
        if not text:
            return ""

        if self.remove_html:
            # Entities are decoded before tags are stripped, so
            # entity-encoded markup (``&lt;b&gt;``) is stripped as well.
            text = html.unescape(text)
            text = self.html_pattern.sub(' ', text)

        if self.remove_urls:
            text = self.url_pattern.sub(' ', text)

        if self.remove_emails:
            text = self.email_pattern.sub(' ', text)

        if self.normalize_unicode:
            text = unicodedata.normalize('NFKC', text)

        if self.remove_numbers:
            text = self.number_pattern.sub(' ', text)

        if self.remove_punctuation:
            # One translate pass instead of one replace() per character.
            # Built per call so an instance-level ``chinese_punctuation``
            # override is honoured.
            punct_table = dict.fromkeys(
                map(ord, string.punctuation + self.chinese_punctuation), ' '
            )
            text = text.translate(punct_table)

        for pattern in self.compiled_custom_patterns:
            text = pattern.sub(' ', text)

        if self.lowercase:
            text = text.lower()

        if self.remove_excessive_spaces:
            text = self.space_pattern.sub(' ', text)
            text = text.strip()

        if self.remove_short_texts and len(text) < self.min_text_length:
            return ""

        return text

    def clean_texts(self, texts: List[str]) -> List[str]:
        """Clean a batch of texts.

        Args:
            texts: Raw texts.

        Returns:
            A list of the same length holding the result of ``clean_text``
            for each input, in order.
        """
        return [self.clean_text(text) for text in texts]

    def remove_redundant_texts(self, texts: List[str]) -> List[str]:
        """Drop empty texts and texts shorter than ``min_text_length``.

        Args:
            texts: Input texts.

        Returns:
            The texts whose length is at least ``min_text_length``, in their
            original order.
        """
        return [text for text in texts if text and len(text) >= self.min_text_length]

    @staticmethod
    def remove_specific_characters(text: str, chars_to_remove: Union[str, Set[str]]) -> str:
        """Remove the given characters from ``text``.

        Args:
            text: Input text.
            chars_to_remove: Characters to delete, as a string or a set of
                strings.

        Returns:
            ``text`` without any of the given characters.
        """
        if isinstance(chars_to_remove, str):
            # Every element of a str is a single character, so the deletion
            # can be done in one pass.
            return text.translate(dict.fromkeys(map(ord, chars_to_remove)))
        # Set elements may be longer than one character, which translate()
        # cannot express; fall back to replace().
        for item in chars_to_remove:
            text = text.replace(item, '')
        return text

    @staticmethod
    def replace_characters(text: str, char_map: Dict[str, str]) -> str:
        """Replace characters according to a mapping.

        Replacements are applied one after another in the dict's iteration
        order, so the output of an earlier replacement can be rewritten by a
        later one.

        Args:
            text: Input text.
            char_map: Maps each string to replace to its replacement.

        Returns:
            The text after all replacements.
        """
        for old_char, new_char in char_map.items():
            text = text.replace(old_char, new_char)
        return text

    @staticmethod
    def remove_empty_lines(text: str) -> str:
        """Remove empty and whitespace-only lines.

        Args:
            text: Input text.

        Returns:
            The remaining lines joined with ``\\n``.
        """
        return '\n'.join(line for line in text.splitlines() if line.strip())

    @staticmethod
    def truncate_text(text: str, max_length: int, truncate_from_end: bool = True) -> str:
        """Truncate ``text`` to at most ``max_length`` characters.

        Args:
            text: Input text.
            max_length: Maximum length; zero or a negative value yields "".
            truncate_from_end: If True, cut off the end and keep the first
                ``max_length`` characters; if False, cut off the start and
                keep the last ``max_length`` characters.

        Returns:
            The truncated text, or ``text`` unchanged if it already fits.
        """
        # Guard first: a negative bound would otherwise be read as a
        # from-the-end slice index and return most of the text.
        if max_length <= 0:
            return ""
        if len(text) <= max_length:
            return text
        return text[:max_length] if truncate_from_end else text[-max_length:]
|
||
|