PicoBot/convert_pdf.py
oudecheng 37f417007e feat(gateway): 添加配置管理功能支持敏感信息脱敏
- 实现 API 端点 /api/config 用于获取和保存配置
- 添加配置信息脱敏功能,保护 API 密钥等敏感数据
- 集成配置验证逻辑,确保时区等参数有效性
- 在前端添加完整的配置管理页面界面
- 实现配置项的动态编辑和保存功能
- 添加连接设置功能用于 WebSocket 连接配置
- 提供多标签页界面分别管理不同配置模块
- 实现配置变更后的实时预览和保存确认
2026-06-15 17:22:32 +08:00

109 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import fitz
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Cm, RGBColor
# Source PDF and destination DOCX (absolute, machine-specific paths).
pdf_path = r'C:\Users\qwer\.picobot\media\ws\aa56c052-ea10-4bc1-aed4-7d06770b6fd9_夜读 _ 明白了这4点就不难养出有主体性的孩子.pdf'
output_path = r'C:\Users\qwer\.picobot\media\夜读_明白了这4点_就不难养出有主体性的孩子.docx'

pdf_doc = fitz.open(pdf_path)
doc = Document()

# Page margins: 2.54 cm top/bottom, 3.18 cm left/right on every section.
for section in doc.sections:
    section.top_margin = Cm(2.54)
    section.bottom_margin = Cm(2.54)
    section.left_margin = Cm(3.18)
    section.right_margin = Cm(3.18)

# Body text style: 12 pt SimSun, 1.5 line spacing, 24 pt first-line indent.
style = doc.styles['Normal']
style.font.name = '宋体'
# font.name only fills the ascii/hAnsi font slots; Word chooses the typeface
# for Chinese characters from w:eastAsia, so it has to be set explicitly or
# the body text falls back to the default East Asian font.
style.element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
style.font.size = Pt(12)
style.paragraph_format.line_spacing = 1.5
style.paragraph_format.first_line_indent = Pt(24)
def add_run(paragraph, text, bold=False, size=None, color=None, italic=False, font_name=None):
    """Append a formatted run to *paragraph* and return it.

    Args:
        paragraph: Object exposing ``add_run(text)`` (a python-docx paragraph).
        text: Text content of the new run.
        bold: Whether the run is bold.
        size: Font size in points; left unset when falsy.
        color: ``(r, g, b)`` tuple for the font colour; left unset when falsy.
        italic: Whether the run is italic.
        font_name: Typeface name; left unset when falsy.

    Returns:
        The newly created run.
    """
    run = paragraph.add_run(text)
    run.bold = bold
    run.italic = italic
    if size:
        run.font.size = Pt(size)
    if color:
        run.font.color.rgb = RGBColor(*color)
    if font_name:
        run.font.name = font_name
        # font.name only fills the ascii/hAnsi slots. Chinese characters are
        # rendered with the w:eastAsia font, so set it too; otherwise a
        # typeface such as '黑体' is silently ignored for CJK text.
        run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
    return run
# Collect the text of every page, then release the PDF right away: only the
# extracted strings are needed afterwards, and the handle must not leak if
# extraction raises.
try:
    page_texts = [page.get_text() for page in pdf_doc]
finally:
    pdf_doc.close()

# Flatten the pages into stripped, non-empty lines.
lines = [
    stripped
    for page_text in page_texts
    for stripped in (raw.strip() for raw in page_text.split('\n'))
    if stripped
]

# The four section headings of the article.
sections_headers = ['塑教育,提倡积极养育', '懂互动,给予丰盈幸福',
                    '有边界,养出人生底气', '稳情绪,才能赢得孩子']
# Advertisement / account boilerplate lines that are dropped entirely.
skip_lines = ['南方都市报电商官方账号。', '南都甄选', '公众号']
# Lines rendered as the right-aligned date/source block.
date_source_lines = ['2026年6月14日 22:28 广东', '南方都市报']
# Prefixes identifying the author/credit lines.
credit_prefixes = ('作者:', '统筹:', '图片:', '投稿邮箱:')


def _is_quote(line):
    """Return True when *line* should be rendered as a centred pull quote."""
    ascii_quoted = line.startswith('"') and line.endswith('"')
    # Curly quotes are written as escapes so they cannot be confused with the
    # ASCII quote; only short curly-quoted lines count as pull quotes.
    curly_quoted = line.startswith('\u201c') and line.endswith('\u201d')
    return ascii_quoted or (curly_quoted and len(line) < 60)


def _add_styled_paragraph(text, align=None, space_before=None, space_after=None, **run_format):
    """Add an unindented paragraph holding one run formatted through add_run."""
    paragraph = doc.add_paragraph()
    if align is not None:
        paragraph.alignment = align
    # Override the first-line indent inherited from the Normal style.
    paragraph.paragraph_format.first_line_indent = Pt(0)
    if space_before is not None:
        paragraph.paragraph_format.space_before = Pt(space_before)
    if space_after is not None:
        paragraph.paragraph_format.space_after = Pt(space_after)
    add_run(paragraph, text, **run_format)
    return paragraph


title_pending = True
for line in lines:
    # Skip advertisement lines (checked first so they never become the title).
    if line in skip_lines:
        continue

    if title_pending:
        # Main title: the first line that survives the skip filter.
        _add_styled_paragraph(line, align=WD_PARAGRAPH_ALIGNMENT.CENTER, space_after=12,
                              bold=True, size=22, font_name='黑体')
        title_pending = False
    elif _is_quote(line):
        # Pull quote: centred, grey italics.
        _add_styled_paragraph(line, align=WD_PARAGRAPH_ALIGNMENT.CENTER,
                              space_before=6, space_after=6,
                              italic=True, size=12, color=(102, 102, 102))
    elif line in date_source_lines:
        # Date and source: small grey text, right-aligned.
        _add_styled_paragraph(line, align=WD_PARAGRAPH_ALIGNMENT.RIGHT,
                              size=10.5, color=(128, 128, 128))
    elif line in sections_headers:
        # One of the four section headings.
        _add_styled_paragraph(line, space_before=18, space_after=6,
                              bold=True, size=15, font_name='黑体')
    elif line.startswith(credit_prefixes):
        # Author / credit information.
        _add_styled_paragraph(line, size=10.5, color=(128, 128, 128))
    elif '转载自' in line or '把世界当成' in line:
        # Closing notes at the end of the article.
        _add_styled_paragraph(line, space_before=6, size=10.5, color=(102, 102, 102))
    else:
        # Ordinary body text uses the Normal style unchanged.
        doc.add_paragraph(line)

doc.save(output_path)
print('转换完成!')