# Source listing: 86 lines, 2.6 KiB, Python.
import os
|
||
import re
|
||
from docx import Document
|
||
|
||
def find_files(start_path, extensions, exclude_dirs):
    """
    Walk a directory tree and collect the files with the given extensions.

    Directories whose name appears in ``exclude_dirs`` are pruned at every
    depth, so nothing beneath them is visited. Sub-directories and files are
    visited in sorted name order so the returned list is reproducible
    regardless of the order in which the file system lists entries.

    :param start_path: Root directory to start walking from.
    :param extensions: Iterable of file-name suffixes to match (e.g. '.py').
    :param exclude_dirs: Iterable of directory names to skip.
    :return: List of paths (each built from ``start_path``) of matching files.
    """
    # Materialise both arguments once: str.endswith accepts a tuple of
    # suffixes, and a set gives O(1) exclusion checks. This also makes
    # one-shot iterables (generators) safe to pass.
    suffixes = tuple(extensions)
    excluded = set(exclude_dirs)

    file_list = []
    for root, dirs, files in os.walk(start_path):
        # Assigning to dirs[:] (in place) is what controls which
        # sub-directories os.walk descends into; rebinding `dirs` would not.
        dirs[:] = sorted(d for d in dirs if d not in excluded)
        file_list.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith(suffixes)
        )
    return file_list
# Characters stripped by clean_text:
#   - C0 control characters other than TAB, LF and CR, which the XML 1.0
#     ``Char`` production forbids;
#   - DEL (0x7F), which this function has always removed;
#   - lone surrogates U+D800-U+DFFF and the non-characters U+FFFE / U+FFFF,
#     which the XML 1.0 ``Char`` production also forbids.
_XML_INCOMPATIBLE_RE = re.compile(
    r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\uD800-\uDFFF\uFFFE\uFFFF]'
)


def clean_text(text):
    """
    Remove characters that cannot be represented in an XML 1.0 document.

    TAB, LF and CR are kept; every other character is either deleted or
    left untouched (nothing is replaced with a substitute).

    :param text: Original text.
    :return: The text with all XML-incompatible characters removed.
    """
    return _XML_INCOMPATIBLE_RE.sub('', text)
def read_files(file_list):
    """
    Read each file as text and return its cleaned content keyed by path.

    Files are decoded as UTF-8. Byte sequences that are not valid UTF-8 are
    replaced with U+FFFD rather than raising ``UnicodeDecodeError``, so a
    single file stored in another encoding does not abort the whole run.
    The decoded text is then passed through ``clean_text``.

    :param file_list: Iterable of file paths to read.
    :return: Dict mapping each path to its cleaned text content.
    :raises OSError: If a file cannot be opened or read.
    """
    content_dict = {}
    for file_path in file_list:
        # errors='replace' keeps the file in the output even when it contains
        # undecodable bytes; the replacement character marks where they were.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            content_dict[file_path] = clean_text(file.read())
    return content_dict
def save_to_docx(content_dict, output_file):
    """
    Write every entry of the content dictionary into one DOCX file.

    For each entry a level-1 heading holding the file path is added,
    followed by a paragraph with the content and a page break. A
    ``ValueError`` raised while adding an entry is printed and the loop
    moves on to the next entry. The document is saved once at the end.

    :param content_dict: Dict mapping file paths to their text content.
    :param output_file: Name of the DOCX file to write.
    """
    document = Document()

    def append_section(title, body):
        # One section per source file: heading, content, then a page break.
        document.add_heading(title, level=1)
        document.add_paragraph(body)
        document.add_page_break()

    for file_path, content in content_dict.items():
        try:
            append_section(file_path, content)
        except ValueError as e:
            print(f"Error processing file {file_path}: {e}")

    document.save(output_file)
if __name__ == "__main__":
    # --- Configuration -----------------------------------------------------
    # File extensions to collect.
    extensions = ['.py', '.html', '.env', '.css', '.js']
    # Sub-directory names to skip while walking.
    exclude_dirs = ['myenv', 'flask_session', 'venv']
    # Root directory to walk.
    directory = '/Users/lishunqin/Desktop/study/pychram project/SumKim_upload/SumKim_upload_system'
    # Name of the DOCX file to produce.
    output_docx = 'output_files_content.docx'

    # --- Pipeline: find -> read -> export ------------------------------------
    files = find_files(directory, extensions, exclude_dirs)
    content_dict = read_files(files)
    save_to_docx(content_dict, output_docx)

    print(f"找到 {len(files)} 个文件,并保存了内容到 {output_docx}")