"""
文件处理工具模块
"""
import os
import shutil
import json
import pickle
import csv
from pathlib import Path
import time
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
import zipfile
import tarfile
from config.system_config import ENCODING, DATA_LOADING_WORKERS
from utils.logger import get_logger
logger = get_logger("file_utils")


def read_text_file(file_path, encoding=ENCODING):
    """
    Read the contents of a text file.

    Args:
        file_path: Path to the file
        encoding: File encoding

    Returns:
        The file contents, or None on failure
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read()
    except Exception as e:
        logger.error(f"Error reading file {file_path}: {str(e)}")
        return None

def write_text_file(content, file_path, encoding=ENCODING):
    """
    Write content to a text file.

    Args:
        content: File content
        file_path: Path to the file
        encoding: File encoding

    Returns:
        True on success, False on failure
    """
    try:
        # Ensure the target directory exists (guard against a bare filename,
        # where os.path.dirname() returns an empty string and makedirs fails)
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        with open(file_path, 'w', encoding=encoding) as file:
            file.write(content)
        return True
    except Exception as e:
        logger.error(f"Error writing file {file_path}: {str(e)}")
        return False

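# Usage sketch (hypothetical path; both helpers report failure via their
# return value rather than raising, so callers should check it):
#     if write_text_file("hello", "tmp/demo.txt"):
#         text = read_text_file("tmp/demo.txt")  # -> "hello", or None on error
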
def save_json(data, file_path, encoding=ENCODING):
    """
    Save data to a file as JSON.

    Args:
        data: Data to save
        file_path: Path to the file
        encoding: File encoding

    Returns:
        True on success, False on failure
    """
    try:
        # Ensure the target directory exists
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        with open(file_path, 'w', encoding=encoding) as file:
            json.dump(data, file, ensure_ascii=False, indent=2)
        return True
    except Exception as e:
        logger.error(f"Error saving JSON file {file_path}: {str(e)}")
        return False


def load_json(file_path, encoding=ENCODING):
    """
    Load JSON data from a file.

    Args:
        file_path: Path to the file
        encoding: File encoding

    Returns:
        The loaded data, or None on failure
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return json.load(file)
    except Exception as e:
        logger.error(f"Error loading JSON file {file_path}: {str(e)}")
        return None

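# Usage sketch (hypothetical path). Note that load_json() returns None both
# on failure and for a stored JSON null; check the log if that matters:
#     save_json({"dataset": "news", "epochs": 3}, "tmp/meta.json")
#     meta = load_json("tmp/meta.json")  # -> {"dataset": "news", "epochs": 3}
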
def save_pickle(data, file_path):
    """
    Save data with pickle.

    Args:
        data: Data to save
        file_path: Path to the file

    Returns:
        True on success, False on failure
    """
    try:
        # Ensure the target directory exists
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        with open(file_path, 'wb') as file:
            pickle.dump(data, file)
        return True
    except Exception as e:
        logger.error(f"Error saving pickle file {file_path}: {str(e)}")
        return False


def load_pickle(file_path):
    """
    Load pickled data from a file.

    Args:
        file_path: Path to the file

    Returns:
        The loaded data, or None on failure
    """
    try:
        with open(file_path, 'rb') as file:
            return pickle.load(file)
    except Exception as e:
        logger.error(f"Error loading pickle file {file_path}: {str(e)}")
        return None

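# Usage sketch (hypothetical path). Unpickling can execute arbitrary code,
# so load_pickle() should only be pointed at files this application wrote
# itself, never at untrusted input:
#     save_pickle({"vocab": ["a", "b"]}, "cache/vocab.pkl")
#     vocab = load_pickle("cache/vocab.pkl")
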
def read_files_parallel(file_paths, max_workers=DATA_LOADING_WORKERS, encoding=ENCODING):
    """
    Read multiple text files in parallel.

    Args:
        file_paths: List of file paths
        max_workers: Maximum number of worker threads
        encoding: File encoding

    Returns:
        List of file contents, in completion order (not input order);
        unreadable files are skipped
    """
    start_time = time.time()
    results = []

    # Single-file read helper
    def read_single_file(file_path):
        return read_text_file(file_path, encoding)

    # Read in parallel with a thread pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(read_single_file, file_path): file_path
                          for file_path in file_paths}
        # Collect results as they complete
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                content = future.result()
                if content is not None:
                    results.append(content)
            except Exception as e:
                logger.error(f"Error processing file {file_path}: {str(e)}")

    elapsed = time.time() - start_time
    logger.info(f"Read {len(file_paths)} files in parallel, "
                f"{len(results)} succeeded, in {elapsed:.2f}s")
    return results

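# Usage sketch (hypothetical directory; combines list_files() from below
# with the parallel reader; remember the contents come back in completion
# order, so do not rely on positional alignment with the input paths):
#     paths = list_files("data/raw", pattern="*.txt")
#     texts = read_files_parallel(paths)
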
def get_file_md5(file_path):
    """
    Compute the MD5 hash of a file.

    Args:
        file_path: Path to the file

    Returns:
        MD5 hex digest string, or None on failure
    """
    hash_md5 = hashlib.md5()
    try:
        with open(file_path, "rb") as f:
            # Read in 4 KiB chunks so large files are never loaded whole
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
    except Exception as e:
        logger.error(f"Error computing MD5 for file {file_path}: {str(e)}")
        return None

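# Usage sketch (hypothetical files; MD5 is adequate for change detection
# and deduplication, but is not collision-resistant, so it should not be
# used for security-sensitive integrity checks):
#     if get_file_md5("data/a.txt") == get_file_md5("data/b.txt"):
#         print("identical contents")
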
def extract_archive(archive_path, extract_to=None):
    """
    Extract an archive file.

    Args:
        archive_path: Path to the archive
        extract_to: Destination directory; defaults to the archive's directory

    Returns:
        True on success, False on failure
    """
    if extract_to is None:
        extract_to = os.path.dirname(archive_path)
    try:
        if archive_path.endswith('.zip'):
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
        elif archive_path.endswith(('.tar.gz', '.tgz')):
            # Note: extractall() on untrusted tarballs is vulnerable to path
            # traversal; on Python 3.12+ it accepts filter='data' to sanitize
            # members, which is worth passing when the source is untrusted.
            with tarfile.open(archive_path, 'r:gz') as tar_ref:
                tar_ref.extractall(extract_to)
        elif archive_path.endswith('.tar'):
            with tarfile.open(archive_path, 'r') as tar_ref:
                tar_ref.extractall(extract_to)
        else:
            logger.error(f"Unsupported archive format: {archive_path}")
            return False
        logger.info(f"Extracted {archive_path} to {extract_to}")
        return True
    except Exception as e:
        logger.error(f"Error extracting {archive_path}: {str(e)}")
        return False

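# Usage sketch (hypothetical archives):
#     extract_archive("data/raw/news.tar.gz")                 # extract next to the archive
#     extract_archive("data/raw/news.zip", "data/extracted")  # extract to an explicit directory
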
def list_files(directory, pattern=None, recursive=True):
    """
    List the files in a directory.

    Args:
        directory: Directory path
        pattern: Filename pattern (supports wildcards)
        recursive: Whether to search subdirectories recursively

    Returns:
        List of file paths
    """
    if not os.path.exists(directory):
        logger.error(f"Directory does not exist: {directory}")
        return []
    directory = Path(directory)
    if pattern:
        if recursive:
            return [str(p) for p in directory.glob(f"**/{pattern}")]
        else:
            return [str(p) for p in directory.glob(pattern)]
    else:
        if recursive:
            return [str(p) for p in directory.rglob("*") if p.is_file()]
        else:
            return [str(p) for p in directory.iterdir() if p.is_file()]

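# Usage sketch (hypothetical directory):
#     all_files = list_files("data")                            # every file, recursively
#     top_csvs = list_files("data", "*.csv", recursive=False)   # top level only
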
def ensure_dir(directory):
    """
    Ensure a directory exists, creating it if necessary.

    Args:
        directory: Directory path
    """
    os.makedirs(directory, exist_ok=True)

def remove_dir(directory):
    """
    Remove a directory and all of its contents.

    Args:
        directory: Directory path

    Returns:
        True on success (including when the directory did not exist),
        False on failure
    """
    try:
        if os.path.exists(directory):
            shutil.rmtree(directory)
        return True
    except Exception as e:
        logger.error(f"Error removing directory {directory}: {str(e)}")
        return False
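

if __name__ == "__main__":
    # Minimal smoke test: a hedged sketch, assuming the current working
    # directory is writable. Every path below is made up for the demo and
    # the directory is removed again at the end.
    demo_dir = "file_utils_demo"
    ensure_dir(demo_dir)
    write_text_file("hello", os.path.join(demo_dir, "a.txt"))
    save_json({"n": 1}, os.path.join(demo_dir, "a.json"))
    logger.info(f"files: {list_files(demo_dir)}")
    logger.info(f"md5: {get_file_md5(os.path.join(demo_dir, 'a.txt'))}")
    logger.info(f"parallel: {read_files_parallel(list_files(demo_dir, '*.txt'))}")
    remove_dir(demo_dir)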