"""
File-handling utility module.
"""

import os
import shutil
import json
import pickle
import csv
from pathlib import Path
import time
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
import zipfile
import tarfile

from config.system_config import ENCODING, DATA_LOADING_WORKERS
from utils.logger import get_logger

logger = get_logger("file_utils")

def read_text_file(file_path, encoding=ENCODING):
    """Read and return the entire contents of a text file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        The file contents as a string, or None when reading fails
        (the error is logged).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as fh:
            content = fh.read()
    except Exception as e:
        logger.error(f"读取文件 {file_path} 时出错: {str(e)}")
        return None
    return content
def write_text_file(content, file_path, encoding=ENCODING):
    """Write text content to a file, creating parent directories as needed.

    Args:
        content: Text to write.
        file_path: Destination file path.
        encoding: Text encoding used when writing.

    Returns:
        True on success, False on failure (the error is logged).
    """
    try:
        # Bug fix: os.path.dirname() returns '' for a bare filename and
        # os.makedirs('') raises, so only create a directory when one exists.
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(file_path, 'w', encoding=encoding) as file:
            file.write(content)
        return True
    except Exception as e:
        logger.error(f"写入文件 {file_path} 时出错: {str(e)}")
        return False
def save_json(data, file_path, encoding=ENCODING):
    """Serialize data to a JSON file, creating parent directories as needed.

    Args:
        data: JSON-serializable object to save.
        file_path: Destination file path.
        encoding: Text encoding for the output file.

    Returns:
        True on success, False on failure (the error is logged).
    """
    try:
        # Bug fix: os.path.dirname() returns '' for a bare filename and
        # os.makedirs('') raises, so only create a directory when one exists.
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(file_path, 'w', encoding=encoding) as file:
            # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable.
            json.dump(data, file, ensure_ascii=False, indent=2)
        return True
    except Exception as e:
        logger.error(f"保存JSON文件 {file_path} 时出错: {str(e)}")
        return False
def load_json(file_path, encoding=ENCODING):
    """Load and return JSON data from a file.

    Args:
        file_path: Path of the JSON file.
        encoding: Text encoding of the file.

    Returns:
        The parsed data, or None when loading fails (the error is logged).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as fh:
            data = json.load(fh)
    except Exception as e:
        logger.error(f"加载JSON文件 {file_path} 时出错: {str(e)}")
        return None
    return data
def save_pickle(data, file_path):
    """Serialize data to a file with pickle, creating parent dirs as needed.

    Args:
        data: Object to pickle.
        file_path: Destination file path.

    Returns:
        True on success, False on failure (the error is logged).
    """
    try:
        # Bug fix: os.path.dirname() returns '' for a bare filename and
        # os.makedirs('') raises, so only create a directory when one exists.
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(file_path, 'wb') as file:
            pickle.dump(data, file)
        return True
    except Exception as e:
        logger.error(f"保存pickle文件 {file_path} 时出错: {str(e)}")
        return False
def load_pickle(file_path):
    """Deserialize and return pickled data from a file.

    Args:
        file_path: Path of the pickle file.

    Returns:
        The unpickled object, or None when loading fails (the error is logged).
    """
    try:
        with open(file_path, 'rb') as fh:
            obj = pickle.load(fh)
    except Exception as e:
        logger.error(f"加载pickle文件 {file_path} 时出错: {str(e)}")
        return None
    return obj
def read_files_parallel(file_paths, max_workers=DATA_LOADING_WORKERS, encoding=ENCODING):
    """Read multiple text files concurrently with a thread pool.

    Args:
        file_paths: Sequence of file paths to read.
        max_workers: Maximum number of worker threads.
        encoding: Text encoding applied to every file.

    Returns:
        List of file contents for the files that were read successfully,
        in the same order as file_paths (failed files are skipped).
    """
    start_time = time.time()
    contents = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(read_text_file, file_path, encoding): file_path
                          for file_path in file_paths}

        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                content = future.result()
                if content is not None:
                    contents[file_path] = content
            except Exception as e:
                logger.error(f"处理文件 {file_path} 时出错: {str(e)}")

    # Bug fix: as_completed yields in nondeterministic completion order;
    # rebuild the result list in the caller's original order.
    results = [contents[p] for p in file_paths if p in contents]

    elapsed = time.time() - start_time
    logger.info(f"并行读取 {len(file_paths)} 个文件,成功 {len(results)} 个,用时 {elapsed:.2f} 秒")

    return results
def get_file_md5(file_path):
    """Compute the MD5 digest of a file, streaming it in 4 KiB chunks.

    Args:
        file_path: Path of the file to hash.

    Returns:
        Hex-encoded MD5 digest string, or None on failure (the error is
        logged).
    """
    digest = hashlib.md5()
    try:
        with open(file_path, "rb") as f:
            while True:
                chunk = f.read(4096)
                if not chunk:
                    break
                digest.update(chunk)
        return digest.hexdigest()
    except Exception as e:
        logger.error(f"计算文件 {file_path} 的MD5值时出错: {str(e)}")
        return None
def extract_archive(archive_path, extract_to=None):
    """Extract a .zip / .tar.gz / .tgz / .tar archive.

    Args:
        archive_path: Path of the archive file.
        extract_to: Target directory; defaults to the archive's own directory.

    Returns:
        True on success, False on failure or unsupported format.
    """
    if extract_to is None:
        extract_to = os.path.dirname(archive_path)

    # SECURITY: tar members may carry absolute or '..' paths (path traversal).
    # Python 3.12+ provides extraction filters; use the 'data' filter when
    # available to reject such members, otherwise fall back to legacy behavior.
    def _safe_tar_extract(tar_ref):
        if hasattr(tarfile, 'data_filter'):
            tar_ref.extractall(extract_to, filter='data')
        else:
            tar_ref.extractall(extract_to)

    try:
        if archive_path.endswith('.zip'):
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
        elif archive_path.endswith(('.tar.gz', '.tgz')):
            with tarfile.open(archive_path, 'r:gz') as tar_ref:
                _safe_tar_extract(tar_ref)
        elif archive_path.endswith('.tar'):
            with tarfile.open(archive_path, 'r') as tar_ref:
                _safe_tar_extract(tar_ref)
        else:
            logger.error(f"不支持的压缩格式: {archive_path}")
            return False

        logger.info(f"成功解压 {archive_path} 到 {extract_to}")
        return True
    except Exception as e:
        logger.error(f"解压 {archive_path} 时出错: {str(e)}")
        return False
def list_files(directory, pattern=None, recursive=True):
    """List files contained in a directory.

    Args:
        directory: Directory to search.
        pattern: Optional glob pattern for names (wildcards supported).
        recursive: Whether to descend into subdirectories.

    Returns:
        List of matching path strings; empty list when the directory does
        not exist (the error is logged).
    """
    if not os.path.exists(directory):
        logger.error(f"目录不存在: {directory}")
        return []

    root = Path(directory)

    if pattern:
        # rglob(p) is equivalent to glob("**/" + p).
        matches = root.rglob(pattern) if recursive else root.glob(pattern)
        return [str(p) for p in matches]

    if recursive:
        return [str(p) for p in root.rglob("*") if p.is_file()]
    return [str(p) for p in root.iterdir() if p.is_file()]
def ensure_dir(directory):
    """Create the directory (and any missing parents) if it is absent.

    Args:
        directory: Directory path that must exist after this call.
    """
    os.makedirs(directory, exist_ok=True)
def remove_dir(directory):
    """Delete a directory tree if it exists.

    Args:
        directory: Directory path to remove.

    Returns:
        True on success (including when the directory was already absent),
        False on failure (the error is logged).
    """
    try:
        if os.path.exists(directory):
            shutil.rmtree(directory)
    except Exception as e:
        logger.error(f"删除目录 {directory} 时出错: {str(e)}")
        return False
    return True