"""
File-handling utility module.
"""

import os
import shutil
import json
import pickle
import csv
from pathlib import Path
import time
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
import zipfile
import tarfile

from config.system_config import ENCODING, DATA_LOADING_WORKERS
from utils.logger import get_logger

logger = get_logger("file_utils")

def read_text_file(file_path, encoding=ENCODING):
    """Read and return the entire contents of a text file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        The file contents as a string, or None when reading fails
        (the error is logged).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as fh:
            content = fh.read()
    except Exception as e:
        logger.error(f"读取文件 {file_path} 时出错: {str(e)}")
        return None
    return content
def write_text_file(content, file_path, encoding=ENCODING):
    """Write text content to a file, creating parent directories as needed.

    Args:
        content: Text to write.
        file_path: Destination file path.
        encoding: Text encoding used when writing.

    Returns:
        True on success, False on failure (the error is logged).
    """
    try:
        # Bug fix: os.path.dirname() returns '' for a bare filename and
        # os.makedirs('') raises, so only create a directory when one exists.
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(file_path, 'w', encoding=encoding) as file:
            file.write(content)
        return True
    except Exception as e:
        logger.error(f"写入文件 {file_path} 时出错: {str(e)}")
        return False
def save_json(data, file_path, encoding=ENCODING):
    """Serialize data to a JSON file, creating parent directories as needed.

    Args:
        data: JSON-serializable object to save.
        file_path: Destination file path.
        encoding: Text encoding for the output file.

    Returns:
        True on success, False on failure (the error is logged).
    """
    try:
        # Bug fix: os.path.dirname() returns '' for a bare filename and
        # os.makedirs('') raises, so only create a directory when one exists.
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(file_path, 'w', encoding=encoding) as file:
            # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable.
            json.dump(data, file, ensure_ascii=False, indent=2)
        return True
    except Exception as e:
        logger.error(f"保存JSON文件 {file_path} 时出错: {str(e)}")
        return False
def load_json(file_path, encoding=ENCODING):
    """Load and return JSON data from a file.

    Args:
        file_path: Path of the JSON file.
        encoding: Text encoding of the file.

    Returns:
        The parsed data, or None when loading fails (the error is logged).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as fh:
            data = json.load(fh)
    except Exception as e:
        logger.error(f"加载JSON文件 {file_path} 时出错: {str(e)}")
        return None
    return data
def save_pickle(data, file_path):
    """Serialize data to a file with pickle, creating parent dirs as needed.

    Args:
        data: Object to pickle.
        file_path: Destination file path.

    Returns:
        True on success, False on failure (the error is logged).
    """
    try:
        # Bug fix: os.path.dirname() returns '' for a bare filename and
        # os.makedirs('') raises, so only create a directory when one exists.
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(file_path, 'wb') as file:
            pickle.dump(data, file)
        return True
    except Exception as e:
        logger.error(f"保存pickle文件 {file_path} 时出错: {str(e)}")
        return False
def load_pickle(file_path):
    """Deserialize and return pickled data from a file.

    Args:
        file_path: Path of the pickle file.

    Returns:
        The unpickled object, or None when loading fails (the error is logged).
    """
    try:
        with open(file_path, 'rb') as fh:
            obj = pickle.load(fh)
    except Exception as e:
        logger.error(f"加载pickle文件 {file_path} 时出错: {str(e)}")
        return None
    return obj
def read_files_parallel(file_paths, max_workers=DATA_LOADING_WORKERS, encoding=ENCODING):
    """Read multiple text files concurrently with a thread pool.

    Args:
        file_paths: Sequence of file paths to read.
        max_workers: Maximum number of worker threads.
        encoding: Text encoding applied to every file.

    Returns:
        List of file contents for the files that were read successfully,
        in the same order as file_paths (failed files are skipped).
    """
    start_time = time.time()
    contents = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(read_text_file, file_path, encoding): file_path
                          for file_path in file_paths}

        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                content = future.result()
                if content is not None:
                    contents[file_path] = content
            except Exception as e:
                logger.error(f"处理文件 {file_path} 时出错: {str(e)}")

    # Bug fix: as_completed yields in nondeterministic completion order;
    # rebuild the result list in the caller's original order.
    results = [contents[p] for p in file_paths if p in contents]

    elapsed = time.time() - start_time
    logger.info(f"并行读取 {len(file_paths)} 个文件,成功 {len(results)} 个,用时 {elapsed:.2f} 秒")

    return results
def get_file_md5(file_path):
    """Compute the MD5 digest of a file, streaming it in 4 KiB chunks.

    Args:
        file_path: Path of the file to hash.

    Returns:
        Hex-encoded MD5 digest string, or None on failure (the error is
        logged).
    """
    digest = hashlib.md5()
    try:
        with open(file_path, "rb") as f:
            while True:
                chunk = f.read(4096)
                if not chunk:
                    break
                digest.update(chunk)
        return digest.hexdigest()
    except Exception as e:
        logger.error(f"计算文件 {file_path} 的MD5值时出错: {str(e)}")
        return None
def extract_archive(archive_path, extract_to=None):
    """Extract a .zip / .tar.gz / .tgz / .tar archive.

    Args:
        archive_path: Path of the archive file.
        extract_to: Target directory; defaults to the archive's own directory.

    Returns:
        True on success, False on failure or unsupported format.
    """
    if extract_to is None:
        extract_to = os.path.dirname(archive_path)

    # SECURITY: tar members may carry absolute or '..' paths (path traversal).
    # Python 3.12+ provides extraction filters; use the 'data' filter when
    # available to reject such members, otherwise fall back to legacy behavior.
    def _safe_tar_extract(tar_ref):
        if hasattr(tarfile, 'data_filter'):
            tar_ref.extractall(extract_to, filter='data')
        else:
            tar_ref.extractall(extract_to)

    try:
        if archive_path.endswith('.zip'):
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
        elif archive_path.endswith(('.tar.gz', '.tgz')):
            with tarfile.open(archive_path, 'r:gz') as tar_ref:
                _safe_tar_extract(tar_ref)
        elif archive_path.endswith('.tar'):
            with tarfile.open(archive_path, 'r') as tar_ref:
                _safe_tar_extract(tar_ref)
        else:
            logger.error(f"不支持的压缩格式: {archive_path}")
            return False

        logger.info(f"成功解压 {archive_path} 到 {extract_to}")
        return True
    except Exception as e:
        logger.error(f"解压 {archive_path} 时出错: {str(e)}")
        return False
def list_files(directory, pattern=None, recursive=True):
    """List files contained in a directory.

    Args:
        directory: Directory to search.
        pattern: Optional glob pattern for names (wildcards supported).
        recursive: Whether to descend into subdirectories.

    Returns:
        List of matching path strings; empty list when the directory does
        not exist (the error is logged).
    """
    if not os.path.exists(directory):
        logger.error(f"目录不存在: {directory}")
        return []

    root = Path(directory)

    if pattern:
        # rglob(p) is equivalent to glob("**/" + p).
        matches = root.rglob(pattern) if recursive else root.glob(pattern)
        return [str(p) for p in matches]

    if recursive:
        return [str(p) for p in root.rglob("*") if p.is_file()]
    return [str(p) for p in root.iterdir() if p.is_file()]
def ensure_dir(directory):
    """Create the directory (and any missing parents) if it is absent.

    Args:
        directory: Directory path that must exist after this call.
    """
    os.makedirs(directory, exist_ok=True)
def remove_dir(directory):
    """Delete a directory tree if it exists.

    Args:
        directory: Directory path to remove.

    Returns:
        True on success (including when the directory was already absent),
        False on failure (the error is logged).
    """
    try:
        if os.path.exists(directory):
            shutil.rmtree(directory)
    except Exception as e:
        logger.error(f"删除目录 {directory} 时出错: {str(e)}")
        return False
    return True