# PicoBot/convert_pdf.py

import fitz
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Cm, RGBColor

# Input PDF and output DOCX locations (hard-coded for this conversion).
pdf_path = r'C:\Users\qwer\.picobot\media\ws\aa56c052-ea10-4bc1-aed4-7d06770b6fd9_夜读 _ 明白了这4点，就不难养出有主体性的孩子.pdf'
output_path = r'C:\Users\qwer\.picobot\media\夜读_明白了这4点_就不难养出有主体性的孩子.docx'

pdf_doc = fitz.open(pdf_path)
doc = Document()

# Page margins.
for section in doc.sections:
    section.top_margin = Cm(2.54)
    section.bottom_margin = Cm(2.54)
    section.left_margin = Cm(3.18)
    section.right_margin = Cm(3.18)

# Body-text defaults applied through the built-in 'Normal' style.
style = doc.styles['Normal']
style.font.name = '宋体'
# font.name only fills the Latin font slots of <w:rFonts>; CJK characters
# are rendered with the east-Asian slot, so it has to be set explicitly or
# Word falls back to its own default font for the Chinese text.
style.element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
style.font.size = Pt(12)
style.paragraph_format.line_spacing = 1.5
style.paragraph_format.first_line_indent = Pt(24)

def add_run(paragraph, text, bold=False, size=None, color=None, italic=False, font_name=None):
    """Append a formatted run to *paragraph* and return it.

    Args:
        paragraph: Object whose ``add_run(text)`` creates the run.
        text: Text content of the new run.
        bold: Value assigned to ``run.bold``.
        size: Font size in points; left untouched when falsy.
        color: ``(r, g, b)`` sequence unpacked into ``RGBColor``; left
            untouched when falsy.
        italic: Value assigned to ``run.italic``.
        font_name: Typeface name; left untouched when falsy.

    Returns:
        The newly created run.
    """
    run = paragraph.add_run(text)
    run.bold = bold
    run.italic = italic
    if size:
        run.font.size = Pt(size)
    if color:
        run.font.color.rgb = RGBColor(*color)
    if font_name:
        run.font.name = font_name
        # font.name only fills the Latin font slots of <w:rFonts>; CJK text
        # is rendered with the east-Asian slot, so mirror the name there or
        # the requested typeface never reaches Chinese characters.
        run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
    return run

# Pull the text out of every page. The PDF handle is closed as soon as
# extraction ends -- even when it fails -- since nothing below reads from it.
try:
    page_texts = [page.get_text().strip() for page in pdf_doc]
finally:
    pdf_doc.close()
full_text = [text for text in page_texts if text]

all_text = '\n'.join(full_text)
lines = [raw.strip() for raw in all_text.split('\n') if raw.strip()]

# Exact-match tables that drive the per-line classification below.
sections_headers = ['塑教育，提倡积极养育', '懂互动，给予丰盈幸福',
                    '有边界，养出人生底气', '稳情绪，才能赢得孩子']
skip_lines = ['南方都市报电商官方账号。', '南都甄选', '公众号']
dateline_lines = ['2026年6月14日 22:28 广东', '南方都市报']
credit_prefixes = ('作者：', '统筹：', '图片：', '投稿邮箱：')

# Opening/closing pairs accepted for a pull quote: straight double quotes and
# curly double quotes (U+201C / U+201D).
quote_pairs = (('"', '"'), ('\u201c', '\u201d'))
max_quote_len = 60


def _is_short_quote(line):
    """Return True when *line* is fully wrapped in a quote pair and short.

    The length cap applies to every quote style, and a line consisting of a
    single quote character is not treated as a quotation.
    """
    if not 2 <= len(line) < max_quote_len:
        return False
    return any(line.startswith(opening) and line.endswith(closing)
               for opening, closing in quote_pairs)


first = True
for line in lines:
    # Skip advertisement lines.
    if line in skip_lines:
        continue

    # Main title: the first line that was not skipped.
    if first:
        p = doc.add_paragraph()
        p.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        p.paragraph_format.first_line_indent = Pt(0)
        p.paragraph_format.space_after = Pt(12)
        add_run(p, line, bold=True, size=22, font_name='黑体')
        first = False

    # Pull quote (wrapped in quotes and short): centred grey italics.
    elif _is_short_quote(line):
        p = doc.add_paragraph()
        p.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        p.paragraph_format.first_line_indent = Pt(0)
        p.paragraph_format.space_before = Pt(6)
        p.paragraph_format.space_after = Pt(6)
        add_run(p, line, italic=True, size=12, color=(102, 102, 102))

    # Date / source line: right-aligned small grey text.
    elif line in dateline_lines:
        p = doc.add_paragraph()
        p.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
        p.paragraph_format.first_line_indent = Pt(0)
        add_run(p, line, size=10.5, color=(128, 128, 128))

    # One of the four section headings.
    elif line in sections_headers:
        p = doc.add_paragraph()
        p.paragraph_format.first_line_indent = Pt(0)
        p.paragraph_format.space_before = Pt(18)
        p.paragraph_format.space_after = Pt(6)
        add_run(p, line, bold=True, size=15, font_name='黑体')

    # Author / credit information.
    elif line.startswith(credit_prefixes):
        p = doc.add_paragraph()
        p.paragraph_format.first_line_indent = Pt(0)
        add_run(p, line, size=10.5, color=(128, 128, 128))

    # Closing notes at the end of the article.
    elif '转载自' in line or '把世界当成' in line:
        p = doc.add_paragraph()
        p.paragraph_format.first_line_indent = Pt(0)
        p.paragraph_format.space_before = Pt(6)
        add_run(p, line, size=10.5, color=(102, 102, 102))

    # Everything else is body text and inherits the 'Normal' style.
    else:
        doc.add_paragraph(line)

doc.save(output_path)
print('转换完成！')