superlishunqin 68b99755ec ALL
2024-11-14 15:46:37 +08:00

86 lines
2.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import re
from docx import Document
def find_files(start_path, extensions, exclude_dirs):
    """
    Walk a directory tree and collect files whose names end with given suffixes.

    Sub-directories whose name appears in ``exclude_dirs`` are pruned, so
    nothing beneath them is visited. Directories and files are processed in
    sorted name order so the returned list is reproducible across runs and
    filesystems (``os.walk`` itself makes no ordering promise).

    :param start_path: root directory to start walking from
    :param extensions: iterable of filename suffixes to match (e.g. ``'.py'``)
    :param exclude_dirs: iterable of directory names (not paths) to skip
    :return: list of matching file paths, each joined onto its walk root
    """
    # str.endswith accepts a tuple of suffixes, so build it once up front
    # instead of looping over the extensions for every file.
    suffixes = tuple(extensions)
    excluded = set(exclude_dirs)
    file_list = []
    for root, dirs, files in os.walk(start_path):
        # Assigning to dirs[:] mutates the list os.walk keeps using, which
        # both prunes excluded directories and fixes the order of descent.
        dirs[:] = sorted(d for d in dirs if d not in excluded)
        file_list.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith(suffixes)
        )
    return file_list
# Characters that XML 1.0 does not allow in document content: C0 controls
# other than tab/LF/CR, lone surrogates, and the noncharacters U+FFFE/U+FFFF.
# U+007F (DEL) is legal XML but was stripped by the earlier version of this
# pattern, so it stays in the set for backward compatibility.
_XML_INCOMPATIBLE_RE = re.compile(
    r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\uD800-\uDFFF\uFFFE\uFFFF]'
)


def clean_text(text):
    """
    Strip characters that cannot be stored in an XML document.

    Tab, line feed and carriage return are preserved; every other character
    matched by ``_XML_INCOMPATIBLE_RE`` is deleted.

    :param text: original text
    :return: text with the XML-incompatible characters removed
    """
    return _XML_INCOMPATIBLE_RE.sub('', text)
def read_files(file_list):
    """
    Read each file as UTF-8 text and return its cleaned content keyed by path.

    A file that is not valid UTF-8 no longer aborts the whole run: a warning
    is printed and the file is read again with undecodable bytes replaced by
    U+FFFD. Other I/O errors (missing file, permissions) still propagate.

    :param file_list: list of file paths to read
    :return: dict mapping each file path to its content after ``clean_text``
    """
    content_dict = {}
    for file_path in file_list:
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw = handle.read()
        except UnicodeDecodeError as exc:
            print(f"Warning: {file_path} is not valid UTF-8 ({exc}); "
                  f"undecodable bytes were replaced")
            with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
                raw = handle.read()
        # Drop characters the DOCX (XML) writer cannot store.
        content_dict[file_path] = clean_text(raw)
    return content_dict
def save_to_docx(content_dict, output_file):
    """
    Write every entry of a path-to-text mapping into a single DOCX document.

    Each entry contributes a level-1 heading holding the path, a paragraph
    holding the text, and a page break. If adding an entry raises
    ``ValueError`` the error is printed and processing continues with the
    next entry. The document is saved once, after all entries are handled.

    :param content_dict: dict mapping file path to file content
    :param output_file: name of the DOCX file to write
    """
    document = Document()
    for path, body in content_dict.items():
        try:
            document.add_heading(path, level=1)
            document.add_paragraph(body)
            # Page break after each entry.
            document.add_page_break()
        except ValueError as err:
            print(f"Error processing file {path}: {err}")
    document.save(output_file)
if __name__ == "__main__":
    # Script entry point: collect source files under a directory and dump
    # their contents into one DOCX file.
    #
    # Usage: python <script> [directory] [output_docx]
    # Both arguments are optional; the previous hard-coded values are kept
    # as defaults so running the script with no arguments behaves as before.
    import sys

    # Directory to scan.
    directory = (
        sys.argv[1] if len(sys.argv) > 1
        else '/Users/lishunqin/Desktop/study/pychram project/SumKim_upload/SumKim_upload_system'
    )
    # Name of the DOCX file to write.
    output_docx = sys.argv[2] if len(sys.argv) > 2 else 'output_files_content.docx'
    # Directory names to skip while scanning.
    exclude_dirs = ['myenv', 'flask_session', 'venv']
    # File suffixes to collect.
    # NOTE(review): '.env' files commonly hold credentials; confirm that
    # copying them into the generated document is intended.
    extensions = ['.py', '.html', '.env', '.css', '.js']

    # os.walk yields nothing for a missing directory, which would silently
    # produce an empty document; fail loudly instead.
    if not os.path.isdir(directory):
        sys.exit(f"目录不存在: {directory}")

    # Find the files, read them, and write the document.
    files = find_files(directory, extensions, exclude_dirs)
    content_dict = read_files(files)
    save_to_docx(content_dict, output_docx)
    print(f"找到 {len(files)} 个文件,并保存了内容到 {output_docx}")