"""Convert one article PDF into a formatted Word (.docx) document.

The script reads every page of the PDF at ``pdf_path``, splits the text into
non-empty lines, classifies each line (title, quote, date/source, section
heading, byline, footer, or body text) and writes a styled paragraph for it
into a new document saved at ``output_path``.
"""

import fitz
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor

pdf_path = r'C:\Users\qwer\.picobot\media\ws\aa56c052-ea10-4bc1-aed4-7d06770b6fd9_夜读 _ 明白了这4点,就不难养出有主体性的孩子.pdf'
output_path = r'C:\Users\qwer\.picobot\media\夜读_明白了这4点_就不难养出有主体性的孩子.docx'

# Lines equal to one of these are rendered as the four section headings.
sections_headers = ['塑教育,提倡积极养育', '懂互动,给予丰盈幸福', '有边界,养出人生底气', '稳情绪,才能赢得孩子']
# Advertisement / account boilerplate lines that are dropped from the output.
skip_lines = ['南方都市报电商官方账号。', '南都甄选', '公众号']

# Lines equal to one of these are rendered as the right-aligned date/source.
_DATE_SOURCE_LINES = ('2026年6月14日 22:28 广东', '南方都市报')
# Lines starting with one of these prefixes are rendered as author credits.
_BYLINE_PREFIXES = ('作者:', '统筹:', '图片:', '投稿邮箱:')
# Both straight and curly (U+201C / U+201D) double quotes mark a pull quote.
_OPENING_QUOTES = ('"', '\u201c')
_CLOSING_QUOTES = ('"', '\u201d')
# A quoted line at or above this length is treated as ordinary body text.
_MAX_QUOTE_LENGTH = 60


def _set_east_asian_font(element, font_name):
    """Set the East Asian typeface on a style or run XML element.

    python-docx's ``font.name`` setter fills in only the Latin font slots, so
    CJK text keeps the default typeface unless ``w:eastAsia`` is set as well.
    Must be called after ``font.name`` has been assigned, which is what
    creates the ``rPr``/``rFonts`` children used here.
    """
    element.rPr.rFonts.set(qn('w:eastAsia'), font_name)


def _is_quote_line(line):
    """Return True for a short line wrapped in double quotes.

    The original condition OR-ed an unbounded quote test with a length-bounded
    copy of itself, so the length limit never took effect; here it is applied
    to every quoted line. The lower bound rejects a lone quote character,
    which would otherwise count as both the opening and the closing quote.
    """
    return (
        2 <= len(line) < _MAX_QUOTE_LENGTH
        and line.startswith(_OPENING_QUOTES)
        and line.endswith(_CLOSING_QUOTES)
    )


def add_run(paragraph, text, bold=False, size=None, color=None, italic=False, font_name=None):
    """Append a formatted run to *paragraph* and return it.

    Args:
        paragraph: python-docx paragraph that receives the run.
        text: Text content of the run.
        bold: Whether the run is bold.
        size: Font size in points; a falsy value keeps the inherited size.
        color: ``(r, g, b)`` tuple; a falsy value keeps the inherited color.
        italic: Whether the run is italic.
        font_name: Typeface name; a falsy value keeps the inherited font.

    Returns:
        The newly created run.
    """
    run = paragraph.add_run(text)
    run.bold = bold
    run.italic = italic
    if size:
        run.font.size = Pt(size)
    if color:
        run.font.color.rgb = RGBColor(*color)
    if font_name:
        run.font.name = font_name
        # Without this the CJK glyphs ignore font_name (see helper docstring).
        _set_east_asian_font(run._element, font_name)
    return run


# Collect every non-empty, stripped line of text from all pages. The context
# manager closes the PDF even if extraction raises.
lines = []
with fitz.open(pdf_path) as pdf_doc:
    for page in pdf_doc:
        for raw_line in page.get_text().split('\n'):
            stripped = raw_line.strip()
            if stripped:
                lines.append(stripped)

doc = Document()

# Page margins.
for section in doc.sections:
    section.top_margin = Cm(2.54)
    section.bottom_margin = Cm(2.54)
    section.left_margin = Cm(3.18)
    section.right_margin = Cm(3.18)

# Body text style.
style = doc.styles['Normal']
style.font.name = '宋体'
_set_east_asian_font(style.element, '宋体')
style.font.size = Pt(12)
style.paragraph_format.line_spacing = 1.5
# 24pt is two character widths at the 12pt body size.
style.paragraph_format.first_line_indent = Pt(24)

# The first line that survives the advertisement filter becomes the title.
first = True
for line in lines:
    # Skip advertisement lines.
    if line in skip_lines:
        continue

    if first:
        # Main title.
        p = doc.add_paragraph()
        p.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        p.paragraph_format.first_line_indent = Pt(0)
        p.paragraph_format.space_after = Pt(12)
        add_run(p, line, bold=True, size=22, font_name='黑体')
        first = False
    elif _is_quote_line(line):
        # Pull quote: short line that starts and ends with a quote mark.
        p = doc.add_paragraph()
        p.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        p.paragraph_format.first_line_indent = Pt(0)
        p.paragraph_format.space_before = Pt(6)
        p.paragraph_format.space_after = Pt(6)
        add_run(p, line, italic=True, size=12, color=(102, 102, 102))
    elif line in _DATE_SOURCE_LINES:
        # Publication date and source.
        p = doc.add_paragraph()
        p.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
        p.paragraph_format.first_line_indent = Pt(0)
        add_run(p, line, size=10.5, color=(128, 128, 128))
    elif line in sections_headers:
        # One of the four section headings.
        p = doc.add_paragraph()
        p.paragraph_format.first_line_indent = Pt(0)
        p.paragraph_format.space_before = Pt(18)
        p.paragraph_format.space_after = Pt(6)
        add_run(p, line, bold=True, size=15, font_name='黑体')
    elif line.startswith(_BYLINE_PREFIXES):
        # Author / credits information.
        p = doc.add_paragraph()
        p.paragraph_format.first_line_indent = Pt(0)
        add_run(p, line, size=10.5, color=(128, 128, 128))
    elif '转载自' in line or '把世界当成' in line:
        # Trailing reprint / footer information.
        p = doc.add_paragraph()
        p.paragraph_format.first_line_indent = Pt(0)
        p.paragraph_format.space_before = Pt(6)
        add_run(p, line, size=10.5, color=(102, 102, 102))
    else:
        # Ordinary body paragraph; inherits the Normal style set above.
        doc.add_paragraph(line)

doc.save(output_path)
print('转换完成!')