"""Convert the main body of a .docx file into rough Markdown.

Usage:
    python <script> INPUT.docx OUTPUT.md

Every ``w:p`` paragraph found in ``word/document.xml`` becomes one output
line.  Paragraph styles whose id contains ``Heading1`` .. ``Heading4`` (or
is exactly ``1`` .. ``4``) become ``#`` .. ``####`` headings; paragraphs that
carry numbering properties become ``- `` bullets indented by list level.
Only ``w:t`` text runs are collected, so tabs and manual line breaks inside
a paragraph are not reproduced.
"""
import sys
import zipfile
from typing import List, Optional, Sequence
from xml.etree import ElementTree

# Clark-notation prefix of the WordprocessingML main namespace.
W = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

# Heading levels that are translated to Markdown headings.
HEADING_LEVELS = (1, 2, 3, 4)

# Indentation emitted per list nesting level.  Two columns is the minimum
# CommonMark needs to nest an item under a parent written as "- ".
LIST_INDENT = '  '


def heading_prefix(style: str) -> str:
    """Return the Markdown heading marker for a paragraph style id.

    Args:
        style: Value of the paragraph's ``w:pStyle`` (``''`` when absent).

    Returns:
        ``'# '`` .. ``'#### '`` for a recognised heading style, else ``''``.
    """
    for level in HEADING_LEVELS:
        if f'Heading{level}' in style or style == str(level):
            return '#' * level + ' '
    return ''


def list_prefix(num_id: str, ilvl: str) -> str:
    """Return the bullet marker for a numbered/bulleted paragraph.

    Args:
        num_id: Value of ``w:numId`` (``''`` when the paragraph has none).
        ilvl: Value of ``w:ilvl``, the zero-based nesting level as text.

    Returns:
        ``'- '`` preceded by ``LIST_INDENT`` per nesting level, or ``''``
        when the paragraph is not a list item.
    """
    # A numId of 0 designates "numbering removed", not a real list.
    if not num_id or num_id == '0':
        return ''
    try:
        depth = max(int(ilvl), 0)
    except ValueError:
        # A malformed level should not abort the whole conversion.
        depth = 0
    return LIST_INDENT * depth + '- '


def paragraph_to_markdown(para: ElementTree.Element) -> str:
    """Render one ``w:p`` element as a single Markdown line.

    Args:
        para: The paragraph element.

    Returns:
        The concatenated text of all ``w:t`` descendants, prefixed with a
        heading or bullet marker when the paragraph properties call for it.
        A heading style takes precedence over list numbering.
    """
    text = ''.join(t.text for t in para.iter(f'{W}t') if t.text)

    style = ''
    num_id = ''
    ilvl = '0'
    props = para.find(f'{W}pPr')
    if props is not None:
        style_el = props.find(f'{W}pStyle')
        if style_el is not None:
            style = style_el.get(f'{W}val', '')
        num_props = props.find(f'{W}numPr')
        if num_props is not None:
            ilvl_el = num_props.find(f'{W}ilvl')
            num_id_el = num_props.find(f'{W}numId')
            if ilvl_el is not None:
                ilvl = ilvl_el.get(f'{W}val', '0')
            if num_id_el is not None:
                num_id = num_id_el.get(f'{W}val', '')

    prefix = heading_prefix(style) or list_prefix(num_id, ilvl)
    return f'{prefix}{text}'


def document_xml_to_lines(content: bytes) -> List[str]:
    """Convert the raw bytes of ``word/document.xml`` to Markdown lines.

    Args:
        content: The XML document as read from the .docx archive.

    Returns:
        One Markdown line per ``w:p`` element, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If ``content`` is not valid XML.
    """
    root = ElementTree.fromstring(content)
    return [paragraph_to_markdown(p) for p in root.findall(f'.//{W}p')]


def convert(docx_path: str, output_path: str) -> int:
    """Read ``docx_path`` and write its Markdown rendering to ``output_path``.

    Args:
        docx_path: Path of the .docx (zip) file to read.
        output_path: Path of the UTF-8 text file to create or overwrite.

    Returns:
        The number of lines written.

    Raises:
        zipfile.BadZipFile: If ``docx_path`` is not a zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
        OSError: If either file cannot be opened.
    """
    with zipfile.ZipFile(docx_path) as archive:
        with archive.open('word/document.xml') as member:
            content = member.read()

    lines = document_xml_to_lines(content)

    # Lines are newline-separated with no trailing newline.
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(lines))
    return len(lines)


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Command-line entry point.

    Args:
        argv: Arguments without the program name; defaults to
            ``sys.argv[1:]``.  Arguments beyond the first two are ignored.

    Returns:
        Process exit status: 0 on success, 2 when arguments are missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.stderr.write(f'Usage: {sys.argv[0]} INPUT.docx OUTPUT.md\n')
        return 2

    docx_path, output_path = args[0], args[1]
    count = convert(docx_path, output_path)
    sys.stderr.write(f'Wrote {count} lines\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())