# claude-code/extract.py
#
# 60 lines
# 1.6 KiB
# Python
# Raw Normal View History
#
# 2026-02-27 13:45:37 +00:00
import zipfile
from xml.etree import ElementTree
import sys
# ElementTree "{namespace-uri}" prefix for WordprocessingML tags and attributes.
W = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

# Deepest heading level that is turned into a Markdown '#' prefix.
MAX_HEADING_LEVEL = 4


def paragraph_to_markdown(para):
    """Render one ``w:p`` element as a single line of Markdown-style text.

    The text of every non-empty ``w:t`` descendant is concatenated.  The line
    is then prefixed according to the paragraph properties (``w:pPr``):

    * a style value containing ``Heading<N>`` or equal to ``'<N>'`` for N in
      1..4 yields ``'#' * N + ' '``;
    * otherwise a non-empty ``w:numId`` yields one space per ``w:ilvl`` level
      followed by ``'- '``;
    * otherwise the text is returned without a prefix.

    Args:
        para: An ``xml.etree.ElementTree.Element`` for a ``w:p`` paragraph.

    Returns:
        The rendered line (may be an empty string for an empty paragraph).
    """
    text = ''.join(t.text for t in para.iter(f'{W}t') if t.text)

    style = ''
    num_id = ''
    ilvl = '0'
    props = para.find(f'{W}pPr')
    if props is not None:
        style_el = props.find(f'{W}pStyle')
        if style_el is not None:
            style = style_el.get(f'{W}val', '')
        num_pr = props.find(f'{W}numPr')
        if num_pr is not None:
            ilvl_el = num_pr.find(f'{W}ilvl')
            num_id_el = num_pr.find(f'{W}numId')
            if ilvl_el is not None:
                ilvl = ilvl_el.get(f'{W}val', '0')
            if num_id_el is not None:
                num_id = num_id_el.get(f'{W}val', '')

    # Headings win over list numbering; levels are tested in ascending order
    # so the first matching level decides the prefix.
    # NOTE(review): the bare-digit comparison presumably covers documents
    # whose heading style ids are plain numbers -- confirm against real input.
    for level in range(1, MAX_HEADING_LEVEL + 1):
        if f'Heading{level}' in style or style == str(level):
            return f"{'#' * level} {text}"

    if num_id:
        # A malformed w:ilvl value should not abort the whole extraction;
        # treat it as a top-level list item instead.
        try:
            depth = int(ilvl)
        except ValueError:
            depth = 0
        return f"{' ' * depth}- {text}"

    return text


def extract_lines(document_xml):
    """Convert the XML of ``word/document.xml`` into a list of output lines.

    Args:
        document_xml: The document XML as ``bytes`` or ``str``.

    Returns:
        One rendered line per ``w:p`` element found below the root, in the
        order returned by ``findall``.

    Raises:
        xml.etree.ElementTree.ParseError: If the XML is not well-formed.
    """
    root = ElementTree.fromstring(document_xml)
    return [paragraph_to_markdown(p) for p in root.findall(f'.//{W}p')]


def main(argv=None):
    """Read a .docx file and write its paragraphs as text lines.

    Args:
        argv: ``[docx_path, output_path]``; defaults to ``sys.argv[1:]``.
            Extra arguments are ignored.

    Returns:
        ``0`` on success, ``2`` if fewer than two arguments were supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.stderr.write('usage: extract.py DOCX_PATH OUTPUT_PATH\n')
        return 2
    docx_path, output_path = args[0], args[1]

    with zipfile.ZipFile(docx_path) as archive:
        with archive.open('word/document.xml') as member:
            content = member.read()

    lines = extract_lines(content)

    with open(output_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(lines))
    sys.stderr.write(f'Wrote {len(lines)} lines\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())