#!/usr/bin/env python3
"""Dump the paragraph text of a .docx file to a plain-text file.

The script opens the .docx (a zip archive), parses ``word/document.xml``,
and writes one output line per ``w:p`` paragraph element.  Paragraphs whose
``w:pStyle`` value is one of the recognised heading ids are prefixed with
the matching number of ``#`` characters.
"""
import os
import zipfile
from xml.etree import ElementTree

# Both the input and the output live next to this script, so the result does
# not depend on the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
docx_path = os.path.join(SCRIPT_DIR, '【Top.One】產品文檔.docx')
output_path = os.path.join(SCRIPT_DIR, 'topone_extracted.txt')

# WordprocessingML namespace in ElementTree's "{uri}" form; it is prepended
# to every tag and attribute name that is looked up below.
W = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

# Paragraph style id -> line prefix.  Both the 'HeadingN' and the bare 'N'
# spellings are accepted; any other style id yields no prefix.
HEADING_PREFIXES = {
    'Heading1': '# ',
    '1': '# ',
    'Heading2': '## ',
    '2': '## ',
    'Heading3': '### ',
    '3': '### ',
    'Heading4': '#### ',
    '4': '#### ',
}


def paragraph_style(para: ElementTree.Element) -> str:
    """Return the ``w:pStyle`` value of *para*, or ``''`` if it has none.

    Only a ``w:pPr`` that is a direct child of the paragraph, and a
    ``w:pStyle`` that is a direct child of that ``w:pPr``, are considered.
    """
    properties = para.find(f'{W}pPr')
    if properties is None:
        return ''
    style_elem = properties.find(f'{W}pStyle')
    if style_elem is None:
        return ''
    return style_elem.get(f'{W}val', '')


def paragraph_to_line(para: ElementTree.Element) -> str:
    """Return the output line for one ``w:p`` element.

    The text of every ``w:t`` descendant is concatenated in document order
    (elements with empty or missing text are skipped) and prefixed according
    to ``HEADING_PREFIXES``.
    """
    text = ''.join(t.text for t in para.iter(f'{W}t') if t.text)
    prefix = HEADING_PREFIXES.get(paragraph_style(para), '')
    return f'{prefix}{text}'


def extract_lines(path):
    """Return one string per ``w:p`` element found in the .docx at *path*.

    *path* is handed straight to ``zipfile.ZipFile``.  Paragraphs are
    returned in the order ``findall`` yields them; paragraphs with no text
    produce an empty string (or a bare prefix for headings).

    Raises:
        KeyError: if the archive has no ``word/document.xml`` member.
    """
    with zipfile.ZipFile(path) as archive:
        with archive.open('word/document.xml') as member:
            content = member.read()
    root = ElementTree.fromstring(content)
    return [paragraph_to_line(para) for para in root.findall(f'.//{W}p')]


def main():
    """Convert ``docx_path`` to ``output_path`` and report the line count."""
    lines = extract_lines(docx_path)
    # Lines are joined with '\n' and no trailing newline is appended.
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(lines))
    print(f'Wrote {len(lines)} lines to {output_path}')


if __name__ == '__main__':
    main()