""" 文件处理工具模块 """ import os import shutil import json import pickle import csv from pathlib import Path import time import hashlib from concurrent.futures import ThreadPoolExecutor, as_completed import zipfile import tarfile from config.system_config import ENCODING, DATA_LOADING_WORKERS from utils.logger import get_logger logger = get_logger("file_utils") def read_text_file(file_path, encoding=ENCODING): """ 读取文本文件内容 Args: file_path: 文件路径 encoding: 文件编码 Returns: 文件内容 """ try: with open(file_path, 'r', encoding=encoding) as file: return file.read() except Exception as e: logger.error(f"读取文件 {file_path} 时出错: {str(e)}") return None def write_text_file(content, file_path, encoding=ENCODING): """ 写入文本文件 Args: content: 文件内容 file_path: 文件路径 encoding: 文件编码 Returns: 成功标志 """ try: # 确保目录存在 os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, 'w', encoding=encoding) as file: file.write(content) return True except Exception as e: logger.error(f"写入文件 {file_path} 时出错: {str(e)}") return False def save_json(data, file_path, encoding=ENCODING): """ 保存JSON数据到文件 Args: data: 要保存的数据 file_path: 文件路径 encoding: 文件编码 Returns: 成功标志 """ try: # 确保目录存在 os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, 'w', encoding=encoding) as file: json.dump(data, file, ensure_ascii=False, indent=2) return True except Exception as e: logger.error(f"保存JSON文件 {file_path} 时出错: {str(e)}") return False def load_json(file_path, encoding=ENCODING): """ 从文件加载JSON数据 Args: file_path: 文件路径 encoding: 文件编码 Returns: 加载的数据 """ try: with open(file_path, 'r', encoding=encoding) as file: return json.load(file) except Exception as e: logger.error(f"加载JSON文件 {file_path} 时出错: {str(e)}") return None def save_pickle(data, file_path): """ 使用pickle保存数据 Args: data: 要保存的数据 file_path: 文件路径 Returns: 成功标志 """ try: # 确保目录存在 os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, 'wb') as file: pickle.dump(data, file) return True except Exception as e: logger.error(f"保存pickle文件 {file_path} 时出错: {str(e)}") return False def load_pickle(file_path): """ 从文件加载pickle数据 Args: file_path: 文件路径 Returns: 加载的数据 """ try: with open(file_path, 'rb') as file: return pickle.load(file) except Exception as e: logger.error(f"加载pickle文件 {file_path} 时出错: {str(e)}") return None def read_files_parallel(file_paths, max_workers=DATA_LOADING_WORKERS, encoding=ENCODING): """ 并行读取多个文本文件 Args: file_paths: 文件路径列表 max_workers: 最大工作线程数 encoding: 文件编码 Returns: 文件内容列表 """ start_time = time.time() results = [] # 定义单个读取函数 def read_single_file(file_path): return read_text_file(file_path, encoding) # 使用线程池并行读取 with ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_file = {executor.submit(read_single_file, file_path): file_path for file_path in file_paths} # 收集结果 for future in as_completed(future_to_file): file_path = future_to_file[future] try: content = future.result() if content is not None: results.append(content) except Exception as e: logger.error(f"处理文件 {file_path} 时出错: {str(e)}") elapsed = time.time() - start_time logger.info(f"并行读取 {len(file_paths)} 个文件,成功 {len(results)} 个,用时 {elapsed:.2f} 秒") return results def get_file_md5(file_path): """ 计算文件的MD5哈希值 Args: file_path: 文件路径 Returns: MD5哈希值 """ hash_md5 = hashlib.md5() try: with open(file_path, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): hash_md5.update(chunk) return hash_md5.hexdigest() except Exception as e: logger.error(f"计算文件 {file_path} 的MD5值时出错: {str(e)}") return None def extract_archive(archive_path, extract_to=None): """ 解压缩文件 Args: 
archive_path: 压缩文件路径 extract_to: 解压目标路径,默认为同目录 Returns: 成功标志 """ if extract_to is None: extract_to = os.path.dirname(archive_path) try: if archive_path.endswith('.zip'): with zipfile.ZipFile(archive_path, 'r') as zip_ref: zip_ref.extractall(extract_to) elif archive_path.endswith(('.tar.gz', '.tgz')): with tarfile.open(archive_path, 'r:gz') as tar_ref: tar_ref.extractall(extract_to) elif archive_path.endswith('.tar'): with tarfile.open(archive_path, 'r') as tar_ref: tar_ref.extractall(extract_to) else: logger.error(f"不支持的压缩格式: {archive_path}") return False logger.info(f"成功解压 {archive_path} 到 {extract_to}") return True except Exception as e: logger.error(f"解压 {archive_path} 时出错: {str(e)}") return False def list_files(directory, pattern=None, recursive=True): """ 列出目录中的文件 Args: directory: 目录路径 pattern: 文件名模式(支持通配符) recursive: 是否递归搜索子目录 Returns: 文件路径列表 """ if not os.path.exists(directory): logger.error(f"目录不存在: {directory}") return [] directory = Path(directory) if pattern: if recursive: return [str(p) for p in directory.glob(f"**/{pattern}")] else: return [str(p) for p in directory.glob(pattern)] else: if recursive: files = [] for p in directory.rglob("*"): if p.is_file(): files.append(str(p)) return files else: return [str(p) for p in directory.iterdir() if p.is_file()] def ensure_dir(directory): """ 确保目录存在,不存在则创建 Args: directory: 目录路径 """ os.makedirs(directory, exist_ok=True) def remove_dir(directory): """ 删除目录及其内容 Args: directory: 目录路径 Returns: 成功标志 """ try: if os.path.exists(directory): shutil.rmtree(directory) return True except Exception as e: logger.error(f"删除目录 {directory} 时出错: {str(e)}") return False
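

# A minimal usage sketch, not part of the module's public API: it exercises
# the text/JSON round-trip helpers, hashing, and parallel reading end to end.
# The "demo_output" directory and the sample data are illustrative assumptions,
# not fixtures from this project; running it requires the config.system_config
# and utils.logger modules imported above to be importable.
if __name__ == "__main__":
    ensure_dir("demo_output")

    # Text round trip
    write_text_file("hello, world", "demo_output/hello.txt")
    print(read_text_file("demo_output/hello.txt"))

    # JSON round trip
    save_json({"lang": "zh", "count": 2}, "demo_output/meta.json")
    print(load_json("demo_output/meta.json"))

    # Integrity check and parallel read (results arrive in completion order)
    print(get_file_md5("demo_output/hello.txt"))
    print(read_files_parallel(["demo_output/hello.txt", "demo_output/meta.json"]))

    # Clean up the demo directory
    remove_dir("demo_output")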