# sam-docs/sam/docs/contracts/scripts/extract_to_markdown.py

#!/usr/bin/env python3
"""
DOCX → Markdown 추출 스크립트

4개 전자계약 DOCX 파일을 Markdown으로 변환한다.
- 서비스이용계약서: Heading 스타일 기반 매핑
- 나머지 3개: Bold 런 + 패턴 매칭으로 구조 유추
"""

import re
import sys
from datetime import date
from pathlib import Path

from docx import Document

# Directory layout, resolved relative to this script: BASE_DIR is the parent
# of the directory that contains this file; DOCX inputs are read from
# BASE_DIR/docx and Markdown outputs are written to BASE_DIR/markdown.
BASE_DIR = Path(__file__).resolve().parent.parent
DOCX_DIR = BASE_DIR.joinpath("docx")
MD_DIR = BASE_DIR.joinpath("markdown")

# Source DOCX file name -> conversion settings:
#   output: Markdown file name written under MD_DIR
#   title:  value placed in the generated front matter
#   type:   "styled" (heading styles drive the structure) or
#           "pattern" (structure inferred from bold runs and text patterns)
FILE_MAP = {
    "01_고객_서비스이용계약서_v4_0_전자서명용.docx": dict(
        output="01-service-agreement.md",
        title="고객사 서비스 이용계약서",
        type="styled",
    ),
    "비밀유지서약서.docx": dict(
        output="02-nda.md",
        title="비밀유지서약서 (NDA)",
        type="pattern",
    ),
    "영업파트너 위촉계약서.docx": dict(
        output="03-partner-agreement.md",
        title="영업파트너 위촉계약서",
        type="pattern",
    ),
    "영업파트너 위촉계약서(단체용).docx": dict(
        output="04-partner-agreement-group.md",
        title="영업파트너 위촉계약서 (단체용)",
        type="pattern",
    ),
}


def table_to_markdown(table):
    """Convert a DOCX table into a Markdown pipe table.

    The first row becomes the header row. Cell text is stripped, embedded
    newlines become spaces, and literal ``|`` characters are escaped so they
    cannot be mistaken for column separators.

    Every row is padded with empty cells up to the widest row in the table,
    so a body row that is wider than the header keeps all of its cells
    instead of being truncated.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text`` (the shape of a python-docx table).

    Returns:
        The Markdown table as a single string, or ``""`` when the table has
        no rows.
    """

    def clean(raw):
        # Strip first, then flatten newlines (same order as before); the pipe
        # escape is what keeps a cell like "A|B" inside a single column.
        return raw.strip().replace("\n", " ").replace("|", "\\|")

    def render(cells):
        return "| " + " | ".join(cells) + " |"

    rows = [[clean(cell.text) for cell in row.cells] for row in table.rows]
    if not rows:
        return ""

    # Size the table by its widest row rather than by the header row.
    width = max(len(row) for row in rows)
    header, *body = [row + [""] * (width - len(row)) for row in rows]

    lines = [render(header), render(["---"] * width)]
    lines.extend(render(row) for row in body)
    return "\n".join(lines)


def get_paragraph_heading_level_styled(para):
    """Return the heading level of a paragraph in a style-based document.

    Used for the service agreement, whose structure is carried by paragraph
    styles. "Heading 1" / "Heading 2" / "Heading 3" map to 1 / 2 / 3; any
    other style, or a paragraph without a style, yields 0 (not a heading).
    """
    style_obj = para.style
    style_name = style_obj.name if style_obj else ""

    level_by_style = {"Heading 1": 1, "Heading 2": 2, "Heading 3": 3}
    return level_by_style.get(style_name, 0)


def get_paragraph_heading_level_pattern(para):
    """Return the heading level of a paragraph in a pattern-based document.

    Used for the NDA and the sales-partner agreements, which carry no heading
    styles. A paragraph can only be a heading when it is non-empty and has at
    least one bold run; the level is then inferred from its text:

    * 2 -- starts with an article marker "제N조", optionally preceded by "<"
      and whitespace
    * 3 -- starts with a "N.N " sub-section number
    * 1 -- starts with one of the known document titles, optionally preceded
      by "<" and whitespace
    * 0 -- anything else (not a heading)

    Args:
        para: Object exposing ``text`` and ``runs`` (each run exposing
            ``bold``), e.g. a python-docx paragraph.
    """
    text = para.text.strip()
    if not text or not any(run.bold for run in para.runs):
        return 0

    # "제X조" -> h2. The optional gap after "<" is matched with \s, which for
    # str patterns covers ASCII and Unicode whitespace alike; this replaces a
    # character class made of visually indistinguishable whitespace
    # characters and mirrors the title pattern below.
    if re.match(r"^<?\s*제\d+조", text):
        return 2

    # "X.X " sub-section title -> h3
    if re.match(r"^\d+\.\d+\s", text):
        return 3

    # Document title -> h1
    if re.match(r"^<?\s*(영업파트너|비밀유지서약서|Sales Partner)", text):
        return 1

    return 0


def is_list_item(para, doc_type):
    """Return True when the paragraph should be treated as a list item.

    * ``doc_type == "styled"``: the paragraph uses the "Compact" style.
    * any other ``doc_type`` (pattern-based documents): the paragraph has no
      bold run and does not start with an article marker ("제N조"), a
      sub-section number ("N.N "), "<", "계약 당사자" or "[".

    Empty (whitespace-only) paragraphs are never list items.

    Args:
        para: Object exposing ``text``, ``style`` and ``runs`` (each run
            exposing ``bold``), e.g. a python-docx paragraph.
        doc_type: ``"styled"`` or a pattern-based type such as ``"pattern"``.
    """
    text = para.text.strip()
    if not text:
        return False

    if doc_type == "styled":
        style = para.style.name if para.style else ""
        return style == "Compact"

    if any(run.bold for run in para.runs):
        return False

    # The previous pattern contained the alternative "<?", which also matches
    # the empty string, so the match always succeeded and this branch could
    # never return True. "<" is now required literally, and the "N.N "
    # sub-section prefix is excluded as well, as the rule was documented.
    structural_prefix = r"^(제\d+조|\d+\.\d+\s|<|계약 당사자|\[)"
    return re.match(structural_prefix, text) is None


def _local_tag(element):
    """Return an XML element's tag without its ``{namespace}`` prefix."""
    tag = element.tag
    if not isinstance(tag, str):
        # Non-element nodes (e.g. XML comments under lxml) expose a
        # non-string tag; treat them as "neither paragraph nor table".
        return ""
    return tag.rsplit("}", 1)[-1]


def _iter_body_items(doc):
    """Yield ``("p", paragraph)`` / ``("tbl", table)`` in document order.

    Walks the children of ``doc.element.body`` and pairs the n-th ``p`` child
    with ``doc.paragraphs[n]`` and the n-th ``tbl`` child with
    ``doc.tables[n]``. Other children are ignored. A ``tbl`` child without a
    matching entry in ``doc.tables`` is skipped.
    """
    # Read both collections once instead of once per body child.
    paragraphs = doc.paragraphs
    tables = doc.tables
    para_idx = 0
    table_idx = 0

    for child in doc.element.body:
        tag = _local_tag(child)
        if tag == "p":
            yield "p", paragraphs[para_idx]
            para_idx += 1
        elif tag == "tbl":
            # A running counter replaces re-counting the preceding tables for
            # every table encountered.
            if table_idx < len(tables):
                yield "tbl", tables[table_idx]
            table_idx += 1


def _runs_to_markdown(runs):
    """Join paragraph runs into Markdown, marking bold text with ``**``.

    Consecutive bold runs are merged and wrapped once, and whitespace at the
    edges of a bold stretch is kept outside the markers. Wrapping every run
    separately produced sequences such as ``**a****b**`` or ``**a **``, which
    Markdown renderers do not reliably treat as emphasis.
    """
    groups = []  # [is_bold, text] pairs; neighbours of equal boldness merged
    for run in runs:
        chunk = run.text or ""
        if not chunk:
            # An empty run must not split a bold stretch in two.
            continue
        is_bold = bool(run.bold)
        if groups and groups[-1][0] == is_bold:
            groups[-1][1] += chunk
        else:
            groups.append([is_bold, chunk])

    parts = []
    for is_bold, chunk in groups:
        core = chunk.strip()
        if not is_bold or not core:
            # Plain text, or a bold stretch that is only whitespace.
            parts.append(chunk)
            continue
        lead = chunk[: len(chunk) - len(chunk.lstrip())]
        trail = chunk[len(chunk.rstrip()):]
        parts.append(f"{lead}**{core}**{trail}")
    return "".join(parts)


def extract_styled_doc(doc, file_info):
    """Extract Markdown from a style-based document (service agreement).

    Paragraph handling, by style:

    * "Heading 1/2/3" -> ``#`` / ``##`` / ``###`` heading, blank line around
    * "Compact" with a bold run -> ``- `` list item with bold text marked
    * "Compact" without bold -> indented ``  - `` sub-item
    * "Body Text" / "First Paragraph" starting with ⚠️, ✅ or ❌ ->
      ``> `` block quote, blank line around
    * anything else -> the paragraph text as-is

    Empty paragraphs become blank lines and tables are rendered with
    ``table_to_markdown`` at their position in the document.

    Args:
        doc: Document object exposing ``element.body``, ``paragraphs`` and
            ``tables`` (a python-docx ``Document``).
        file_info: Conversion settings for the file. Not used here; accepted
            so both extractors share one signature.

    Returns:
        The Markdown body as a single string (without front matter).
    """
    callout_prefixes = ("⚠️", "✅", "❌")
    lines = []

    for kind, item in _iter_body_items(doc):
        if kind == "tbl":
            lines.extend(("", table_to_markdown(item), ""))
            continue

        para = item
        text = para.text.strip()
        if not text:
            lines.append("")
            continue

        style = para.style.name if para.style else ""
        level = get_paragraph_heading_level_styled(para)

        if level > 0:
            lines.extend(("", f"{'#' * level} {text}", ""))
        elif style == "Compact":
            if any(run.bold for run in para.runs):
                lines.append(f"- {_runs_to_markdown(para.runs)}")
            else:
                # Compact paragraph without emphasis: nested sub-item.
                lines.append(f"  - {text}")
        elif style in ("Body Text", "First Paragraph") and text.startswith(
            callout_prefixes
        ):
            lines.extend(("", f"> {text}", ""))
        else:
            lines.append(text)

    return "\n".join(lines)


def extract_pattern_doc(doc, file_info):
    """Extract Markdown from a pattern-based document (NDA, partner agreements).

    Paragraph handling:

    * heading (per ``get_paragraph_heading_level_pattern``) -> ``#`` heading
      of that level with a leading "<" / trailing ">" removed
    * has a bold run and starts with bold "예시 ", "Phase " or "별첨 " ->
      ``####`` heading using the plain text
    * has a bold run otherwise -> ``- `` list item with bold text marked
    * no bold, contains ``___`` (blank form field) -> text as-is
    * no bold, starts with a contact/identity label followed by ":" ->
      ``- `` list item
    * any other plain text -> indented ``  - `` sub-item

    Empty paragraphs become blank lines and tables are rendered with
    ``table_to_markdown`` at their position in the document.

    Args:
        doc: Document object exposing ``element.body``, ``paragraphs`` and
            ``tables`` (a python-docx ``Document``).
        file_info: Conversion settings for the file. Not used here; accepted
            so both extractors share one signature.

    Returns:
        The Markdown body as a single string (without front matter).
    """
    lines = []

    for kind, item in _iter_body_items(doc):
        if kind == "tbl":
            lines.extend(("", table_to_markdown(item), ""))
            continue

        para = item
        text = para.text.strip()
        if not text:
            lines.append("")
            continue

        level = get_paragraph_heading_level_pattern(para)

        if level > 0:
            # Strip the "< ... >" decoration some titles are wrapped in.
            clean_text = re.sub(r"^<\s*|\s*>$", "", text).strip()
            lines.extend(("", f"{'#' * level} {clean_text}", ""))
        elif any(run.bold for run in para.runs):
            combined = _runs_to_markdown(para.runs)
            # "예시 N:", "Phase N:", "별첨 N" style captions become h4; every
            # other bold paragraph (including "(1)", "(2)" numbered clauses)
            # is a top-level list item.
            if re.match(r"^\*\*(예시|Phase|별첨)\s", combined):
                lines.extend(("", f"#### {text}", ""))
            else:
                lines.append(f"- {combined}")
        elif "___" in text:
            # Keep blank form fields exactly as written.
            lines.append(text)
        elif re.match(
            r"^(이메일|전화|주소|상호|대표|사업자|주민|연락처|날짜):", text
        ):
            lines.append(f"- {text}")
        else:
            lines.append(f"  - {text}")

    return "\n".join(lines)


def add_frontmatter(content, file_info, docx_name):
    """Prepend a YAML front-matter block to the Markdown content.

    The block contains ``title``, ``version``, ``date`` (today's date in ISO
    format) and ``docx_file``, each written as a double-quoted YAML scalar.
    Backslashes and double quotes inside the values are escaped so that a
    title or file name containing them cannot break the front matter.

    Args:
        content: Markdown body to place after the front matter.
        file_info: Mapping with a ``"title"`` entry and an optional
            ``"version"`` entry (defaults to ``"v4.0"``).
        docx_name: Name of the source DOCX file.

    Returns:
        The front matter, one blank line, then ``content``.
    """

    def quoted(value):
        # Escape the backslash first so the escapes added for quotes are not
        # themselves doubled.
        escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    fields = (
        ("title", file_info["title"]),
        ("version", file_info.get("version", "v4.0")),
        ("date", date.today().isoformat()),
        ("docx_file", docx_name),
    )
    header = "\n".join(f"{key}: {quoted(value)}" for key, value in fields)
    return f"---\n{header}\n---\n\n{content}"


def extract_file(docx_name, file_info):
    """Convert a single DOCX file under DOCX_DIR into Markdown under MD_DIR.

    Prints a "[SKIP]" line and returns False when the source file does not
    exist; otherwise writes the Markdown file, prints an "[OK]" line and
    returns True.

    Args:
        docx_name: File name of the DOCX, relative to DOCX_DIR.
        file_info: Conversion settings; ``"type"`` selects the extractor and
            ``"output"`` names the Markdown file.
    """
    source = DOCX_DIR / docx_name
    if not source.exists():
        print(f"  [SKIP] {docx_name} - 파일 없음")
        return False

    document = Document(str(source))

    # "styled" documents rely on heading styles; everything else is parsed
    # with the bold-run / text-pattern heuristics.
    extractor = (
        extract_styled_doc if file_info["type"] == "styled" else extract_pattern_doc
    )
    markdown = add_frontmatter(extractor(document, file_info), file_info, docx_name)

    # Collapse runs of three or more newlines into two, i.e. at most one
    # blank line in a row.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)

    (MD_DIR / file_info["output"]).write_text(markdown, encoding="utf-8")
    print(f"  [OK] {docx_name} → {file_info['output']}")
    return True


def main():
    """Convert every file listed in FILE_MAP and report the outcome.

    Creates MD_DIR if needed, runs ``extract_file`` for each entry and prints
    a summary line.

    Returns:
        0 when every file was converted, 1 otherwise.
    """
    print(
        "DOCX → Markdown 추출 시작",
        f"  DOCX 디렉토리: {DOCX_DIR}",
        f"  출력 디렉토리: {MD_DIR}",
        "",
        sep="\n",
    )

    MD_DIR.mkdir(parents=True, exist_ok=True)

    total = len(FILE_MAP)
    converted = sum(
        1 for name, info in FILE_MAP.items() if extract_file(name, info)
    )

    print(f"\n완료: {converted}/{total} 파일 변환됨")
    return 1 if converted != total else 0


# When run as a script, perform the conversion and pass main()'s return value
# (0 = all files converted, 1 = at least one was not) to sys.exit as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())