# Chinese_Text_Classification.../preprocessing/text_cleaner.py

"""
文本清洗模块：实现文本清洗，去除无用字符、HTML标签等
"""
import re
import unicodedata
import html
from typing import List, Dict, Tuple, Optional, Any, Callable, Set, Union
import string

from utils.logger import get_logger

# Module-level logger, obtained from the project's logging helper under the
# name "TextCleaner"; TextCleaner.__init__ reports through it.
logger = get_logger("TextCleaner")


class TextCleaner:
    """Configurable text cleaner with support for Chinese punctuation.

    An instance stores a set of on/off switches chosen at construction time;
    ``clean_text`` then applies the enabled steps in a fixed order. A few
    stateless helpers (character removal/replacement, empty-line removal,
    truncation) are exposed as static methods.
    """

    def __init__(self, remove_html: bool = True,
                 remove_urls: bool = True,
                 remove_emails: bool = True,
                 remove_numbers: bool = False,
                 remove_punctuation: bool = False,
                 lowercase: bool = False,
                 normalize_unicode: bool = True,
                 remove_excessive_spaces: bool = True,
                 remove_short_texts: bool = False,
                 min_text_length: int = 10,
                 custom_patterns: Optional[List[str]] = None):
        """
        Initialise the cleaner.

        Args:
            remove_html: Decode HTML entities and strip HTML tags.
            remove_urls: Strip URLs (``http://``, ``https://`` or ``www.``).
            remove_emails: Strip e-mail addresses.
            remove_numbers: Strip runs of digits.
            remove_punctuation: Strip ASCII and Chinese punctuation.
            lowercase: Lower-case the text (no effect on Chinese characters).
            normalize_unicode: Apply NFKC Unicode normalisation.
            remove_excessive_spaces: Collapse whitespace runs and trim the ends.
            remove_short_texts: Return "" for results shorter than
                ``min_text_length``.
            min_text_length: Minimum length used by ``remove_short_texts`` and
                by ``remove_redundant_texts``.
            custom_patterns: Extra regular expressions; every match is
                replaced with a single space.

        Raises:
            re.error: If one of ``custom_patterns`` is not a valid regex.
        """
        self.remove_html = remove_html
        self.remove_urls = remove_urls
        self.remove_emails = remove_emails
        self.remove_numbers = remove_numbers
        self.remove_punctuation = remove_punctuation
        self.lowercase = lowercase
        self.normalize_unicode = normalize_unicode
        self.remove_excessive_spaces = remove_excessive_spaces
        self.remove_short_texts = remove_short_texts
        self.min_text_length = min_text_length
        self.custom_patterns = custom_patterns or []

        # Pre-compiled patterns.
        # ``[^>]*`` (instead of ``.*?``) lets a tag span several lines.
        self.html_pattern = re.compile(r'<[^>]*>')
        # URLs and e-mail addresses are limited to ASCII characters on purpose:
        # Chinese text has no spaces between words, so a ``\S+`` tail would
        # swallow the rest of the sentence together with the URL/address.
        # Trade-off: non-ASCII parts of internationalised URLs are left behind.
        self.url_pattern = re.compile(
            r"(?:https?://|www\.)[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+"
        )
        self.email_pattern = re.compile(
            r'[A-Za-z0-9._%+\-]+@[A-Za-z0-9\-]+(?:\.[A-Za-z0-9\-]+)+'
        )
        self.number_pattern = re.compile(r'\d+')
        self.space_pattern = re.compile(r'\s+')

        # User-supplied patterns, compiled once.
        self.compiled_custom_patterns = [re.compile(pattern) for pattern in self.custom_patterns]

        # Chinese punctuation. The curly quotation marks are written as escapes
        # so they cannot be mistaken for (or converted into) string delimiters.
        self.chinese_punctuation = (
            "，。！？；："
            "\u201c\u201d\u2018\u2019"
            "【】《》（）、…—～·"
        )

        logger.info("文本清洗器初始化完成")

    def _punctuation_table(self) -> Dict[int, str]:
        """Build a ``str.translate`` table mapping every punctuation mark to a space."""
        return dict.fromkeys(map(ord, string.punctuation + self.chinese_punctuation), ' ')

    def clean_text(self, text: str) -> str:
        """
        Clean one text by applying every enabled step.

        Steps run in this order: HTML entity decoding and tag stripping, URL
        removal, e-mail removal, NFKC normalisation, digit removal,
        punctuation removal, custom patterns, lower-casing, whitespace
        collapsing, short-text filtering. Removed spans are replaced with a
        space so neighbouring tokens do not fuse.

        Note that entities are decoded before tags are stripped, so escaped
        markup such as ``&lt;b&gt;`` is treated as a tag and removed as well.

        Args:
            text: Raw text.

        Returns:
            The cleaned text, or "" when the input is empty or the result is
            shorter than ``min_text_length`` with ``remove_short_texts`` on.
        """
        if not text:
            return ""

        # Decode entities, then strip tags.
        if self.remove_html:
            text = html.unescape(text)
            text = self.html_pattern.sub(' ', text)

        if self.remove_urls:
            text = self.url_pattern.sub(' ', text)

        if self.remove_emails:
            text = self.email_pattern.sub(' ', text)

        # NFKC folds full-width forms (e.g. full-width punctuation) into
        # their ASCII counterparts.
        if self.normalize_unicode:
            text = unicodedata.normalize('NFKC', text)

        if self.remove_numbers:
            text = self.number_pattern.sub(' ', text)

        # Single pass over the text instead of one ``replace`` per mark.
        if self.remove_punctuation:
            text = text.translate(self._punctuation_table())

        for pattern in self.compiled_custom_patterns:
            text = pattern.sub(' ', text)

        if self.lowercase:
            text = text.lower()

        if self.remove_excessive_spaces:
            text = self.space_pattern.sub(' ', text)
            text = text.strip()

        if self.remove_short_texts and len(text) < self.min_text_length:
            return ""

        return text

    def clean_texts(self, texts: List[str]) -> List[str]:
        """
        Clean a batch of texts.

        Args:
            texts: Raw texts.

        Returns:
            A list of the same length holding ``clean_text`` of each item.
        """
        return [self.clean_text(text) for text in texts]

    def remove_redundant_texts(self, texts: List[str]) -> List[str]:
        """
        Drop empty texts and texts shorter than ``min_text_length``.

        Args:
            texts: Input texts.

        Returns:
            The texts that are non-empty and at least ``min_text_length`` long,
            in their original order.
        """
        return [text for text in texts if text and len(text) >= self.min_text_length]

    @staticmethod
    def remove_specific_characters(text: str, chars_to_remove: Union[str, Set[str]]) -> str:
        """
        Remove the given characters from a text.

        Args:
            text: Input text.
            chars_to_remove: Either a string (each character is removed) or a
                set of strings (each element is removed wherever it occurs).

        Returns:
            The text without the requested characters.
        """
        if isinstance(chars_to_remove, str):
            # Every character maps to None, i.e. is deleted, in one pass.
            return text.translate(dict.fromkeys(map(ord, chars_to_remove)))

        # Set elements may be longer than one character, so use ``replace``.
        for item in chars_to_remove:
            text = text.replace(item, '')
        return text

    @staticmethod
    def replace_characters(text: str, char_map: Dict[str, str]) -> str:
        """
        Replace characters (or substrings) according to a mapping.

        Replacements are applied one after another in the mapping's iteration
        order, so a later entry also sees the output of earlier ones.

        Args:
            text: Input text.
            char_map: Maps each string to replace to its replacement.

        Returns:
            The text after all replacements.
        """
        for old_char, new_char in char_map.items():
            text = text.replace(old_char, new_char)
        return text

    @staticmethod
    def remove_empty_lines(text: str) -> str:
        """
        Remove lines that are empty or contain only whitespace.

        Args:
            text: Input text.

        Returns:
            The remaining lines joined with a newline character.
        """
        return '\n'.join(line for line in text.splitlines() if line.strip())

    @staticmethod
    def truncate_text(text: str, max_length: int, truncate_from_end: bool = True) -> str:
        """
        Truncate a text to at most ``max_length`` characters.

        Args:
            text: Input text.
            max_length: Maximum number of characters to keep; must be >= 0.
            truncate_from_end: If True, cut off the end and keep the beginning;
                if False, cut off the beginning and keep the end.

        Returns:
            The text itself when it is short enough, otherwise the kept part.

        Raises:
            ValueError: If ``max_length`` is negative.
        """
        # A negative length would otherwise slice from the wrong side.
        if max_length < 0:
            raise ValueError("max_length must be non-negative")

        if len(text) <= max_length:
            return text

        if truncate_from_end:
            return text[:max_length]
        return text[len(text) - max_length:]