claude-code/extract_docx.py

49 lines
1.3 KiB
Python
Raw Normal View History

2026-02-27 13:45:37 +00:00
#!/usr/bin/env python3
import zipfile
from xml.etree import ElementTree
import os
docx_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '【Top.One】產品文檔.docx')
output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'topone_extracted.txt')

# ElementTree spells namespaced tags as '{namespace-uri}localname'; this is
# the prefix for the WordprocessingML main namespace (the "w:" elements).
W = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

# Paragraph style ID -> Markdown heading prefix.  Both the 'HeadingN' and the
# bare-digit 'N' spellings are recognised; any other style ID gets no prefix.
# NOTE(review): bare digits are presumably the IDs emitted by some localized
# Word templates -- confirm against the source document's styles.xml.
_HEADING_PREFIXES = {
    'Heading1': '# ',
    '1': '# ',
    'Heading2': '## ',
    '2': '## ',
    'Heading3': '### ',
    '3': '### ',
    'Heading4': '#### ',
    '4': '#### ',
}


def _paragraph_text(para):
    """Return the concatenation of every non-empty ``w:t`` text under *para*."""
    return ''.join(t.text for t in para.iter(f'{W}t') if t.text)


def _paragraph_style(para):
    """Return the ``w:val`` of *para*'s ``w:pPr/w:pStyle``, or '' if absent."""
    p_pr = para.find(f'{W}pPr')
    if p_pr is None:
        return ''
    p_style = p_pr.find(f'{W}pStyle')
    if p_style is None:
        return ''
    return p_style.get(f'{W}val', '')


def extract_lines(document_xml):
    """Convert the XML of ``word/document.xml`` into a list of text lines.

    One line is produced per ``w:p`` element found anywhere below the root,
    in document order.  Each line is the paragraph's text, prefixed with a
    Markdown heading marker when the paragraph's style ID is one of the
    recognised heading styles (levels 1-4).  Paragraphs without text yield
    an empty line (or a bare prefix if they carry a heading style).

    Args:
        document_xml: The XML document as ``bytes`` or ``str``.

    Returns:
        A list of strings, one per paragraph.

    Raises:
        xml.etree.ElementTree.ParseError: If *document_xml* is not
            well-formed XML.
    """
    root = ElementTree.fromstring(document_xml)
    return [
        _HEADING_PREFIXES.get(_paragraph_style(para), '') + _paragraph_text(para)
        for para in root.findall(f'.//{W}p')
    ]


def main():
    """Read ``docx_path``, extract its paragraphs, and write ``output_path``.

    The output file is UTF-8 text with the extracted lines joined by
    newlines (no trailing newline).  A one-line summary is printed to
    stdout when done.
    """
    # A .docx file is a zip archive; the main document body is stored in
    # the 'word/document.xml' member.
    with zipfile.ZipFile(docx_path) as archive:
        document_xml = archive.read('word/document.xml')

    lines = extract_lines(document_xml)

    with open(output_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(lines))
    print(f'Wrote {len(lines)} lines to {output_path}')


if __name__ == '__main__':
    main()