sam-docs/sam/docs/contracts/scripts/sync_check.py

#!/usr/bin/env python3
"""
DOCX ↔ Markdown 동기화 검증 스크립트

DOCX에서 텍스트를 추출하고 Markdown 파일의 텍스트와 비교하여
불일치 항목을 리포트한다.
"""

import difflib
import re
import sys
from pathlib import Path

from docx import Document

# Directory layout: BASE_DIR is the parent of the directory containing this
# script; the "docx" and "markdown" folders are looked up directly under it.
_SCRIPT_DIR = Path(__file__).resolve().parent
BASE_DIR = _SCRIPT_DIR.parent
DOCX_DIR = BASE_DIR.joinpath("docx")
MD_DIR = BASE_DIR.joinpath("markdown")

# DOCX file name (under DOCX_DIR) -> Markdown file name (under MD_DIR).
# Pairs are checked in the order listed here.
FILE_MAP = {
    "01_고객_서비스이용계약서_v4_0_전자서명용.docx": "01-service-agreement.md",
    "비밀유지서약서.docx": "02-nda.md",
    "영업파트너 위촉계약서.docx": "03-partner-agreement.md",
    "영업파트너 위촉계약서(단체용).docx": "04-partner-agreement-group.md",
}


def extract_text_from_docx(docx_path):
    """Extract the plain text of a DOCX file in document order.

    Walks the direct children of the document body so that paragraphs and
    tables are emitted interleaved, in the order they appear. Revision-history
    content is left out:

    * a paragraph containing "개정이력" is dropped, and the first table that
      follows it (before any other non-empty paragraph) is skipped;
    * any table whose first row has a cell equal to "버전" and a cell equal
      to "날짜" is skipped.

    Args:
        docx_path: Path to the DOCX file; converted with ``str()`` before
            being handed to ``Document``.

    Returns:
        A list of non-empty strings: stripped paragraph texts, and one
        " | "-joined string per non-blank table row.
    """
    doc = Document(str(docx_path))
    lines = []

    # ``doc.paragraphs`` / ``doc.tables`` build a new list on every access, so
    # fetch each once and consume them in step with the body's child elements
    # instead of re-evaluating the properties for every element.
    paragraphs = iter(doc.paragraphs)
    tables = iter(doc.tables)
    skip_revision = False

    for child in doc.element.body:
        # Comment / processing-instruction nodes carry a non-string tag;
        # they hold no document text, so ignore them.
        if not isinstance(child.tag, str):
            continue

        # Drop the "{namespace}" prefix to get the local element name.
        tag = child.tag.rsplit("}", 1)[-1]

        if tag == "p":
            paragraph = next(paragraphs, None)
            if paragraph is None:
                continue

            text = paragraph.text.strip()
            if "개정이력" in text:
                # Revision-history heading: drop it and arm the table skip.
                skip_revision = True
                continue
            if text:
                # Any other real paragraph cancels a pending table skip.
                skip_revision = False
                lines.append(text)

        elif tag == "tbl":
            table = next(tables, None)
            if table is None:
                continue

            rows = table.rows

            # Skip the revision-history table, recognised by its header row.
            if len(rows) > 0:
                first_row_text = [cell.text.strip() for cell in rows[0].cells]
                if "버전" in first_row_text and "날짜" in first_row_text:
                    skip_revision = False
                    continue

            # Skip the table that directly follows a revision-history heading.
            if skip_revision:
                skip_revision = False
                continue

            for row in rows:
                row_text = " | ".join(cell.text.strip() for cell in row.cells)
                if row_text.strip():
                    lines.append(row_text)

    return lines


# Patterns used by extract_text_from_markdown, compiled once at import time.
_MD_BLOCKQUOTE_RE = re.compile(r"^>\s*")
_MD_HEADING_RE = re.compile(r"^#{1,6}\s+")
_MD_LIST_MARKER_RE = re.compile(r"^\s*[-*+]\s+")
_MD_BOLD_RE = re.compile(r"\*\*(.+?)\*\*")
_MD_ITALIC_RE = re.compile(r"\*(.+?)\*")
# Table delimiter row such as "|---|---|" or "|:--|--:|" (alignment colons).
_MD_TABLE_RULE_RE = re.compile(r"^\|[\s:\-|]+\|$")


def extract_text_from_markdown(md_path):
    """Extract the plain text of a Markdown file, one entry per content line.

    Processing per line:

    * A YAML front-matter block is skipped, but only when its opening "---"
      is the first non-blank line of the file. Any later "---" line is
      treated as a horizontal rule and dropped on its own, so the text
      between two rules is kept.
    * Blank lines and table delimiter rows are dropped.
    * Block-quote, heading and list markers and bold/italic asterisks are
      removed. The block-quote marker is removed first so that a quoted
      heading or list item ("> ## Title") is cleaned as well.
    * Table rows are rewritten as their cell texts joined with " | ".

    Args:
        md_path: Object providing ``read_text(encoding=...)`` (e.g. a
            ``pathlib.Path``) for the Markdown file.

    Returns:
        A list of non-empty, stripped text lines in file order.
    """
    content = md_path.read_text(encoding="utf-8")
    lines = []

    in_frontmatter = False
    seen_content = False  # becomes True at the first non-blank line

    for line in content.split("\n"):
        stripped = line.strip()

        # Inside front matter: swallow everything up to the closing "---".
        if in_frontmatter:
            if stripped == "---":
                in_frontmatter = False
            continue

        if not stripped:
            continue

        if stripped == "---":
            # Front matter may only open the file; afterwards "---" is a
            # horizontal rule, which carries no text.
            if not seen_content:
                in_frontmatter = True
            seen_content = True
            continue
        seen_content = True

        # Strip Markdown markup.
        text = _MD_BLOCKQUOTE_RE.sub("", stripped)
        text = _MD_HEADING_RE.sub("", text)
        text = _MD_LIST_MARKER_RE.sub("", text)
        text = _MD_BOLD_RE.sub(r"\1", text)
        text = _MD_ITALIC_RE.sub(r"\1", text)

        # Table delimiter rows carry no text.
        if _MD_TABLE_RULE_RE.match(text):
            continue

        # Table row: drop the outer pipes and normalise the cell separators.
        if text.startswith("|") and text.endswith("|"):
            cells = [c.strip() for c in text.strip("|").split("|")]
            text = " | ".join(cells)

        text = text.strip()
        if text:
            lines.append(text)

    return lines


def normalize_text(text):
    """Normalize one line of text so both document formats compare equal.

    Whitespace runs are collapsed to a single space and the ends trimmed,
    non-breaking and ideographic spaces are mapped to a plain space, and any
    run of three or more underscores (a fill-in blank) becomes exactly "___".

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    collapsed = re.sub(r"\s+", " ", text).strip()

    # U+00A0 (non-breaking space) and U+3000 (ideographic space).
    for special_space in ("\u00a0", "\u3000"):
        collapsed = collapsed.replace(special_space, " ")

    # Blanks drawn with underscores may differ in length between formats.
    return re.sub(r"_{3,}", "___", collapsed)


def compare_documents(docx_name, md_name):
    """Compare the extracted text of one DOCX / Markdown pair line by line.

    Args:
        docx_name: File name of the DOCX document, resolved under DOCX_DIR.
        md_name: File name of the Markdown document, resolved under MD_DIR.

    Returns:
        If either file is missing: ``{"status": "error", "message": ...}``.
        Otherwise a dict with:
            status: "ok"
            similarity: SequenceMatcher ratio as a percentage, rounded to
                one decimal place
            docx_lines / md_lines: number of normalized lines on each side
            diff_count: total number of differing entries
            diffs: the first 20 entries, each a dict with "type", "docx" and
                "markdown" (texts truncated to 80 characters)
    """
    docx_path = DOCX_DIR / docx_name
    md_path = MD_DIR / md_name

    if not docx_path.exists():
        return {"status": "error", "message": f"DOCX 파일 없음: {docx_name}"}
    if not md_path.exists():
        return {"status": "error", "message": f"Markdown 파일 없음: {md_name}"}

    docx_lines = [
        normalize_text(line)
        for line in extract_text_from_docx(docx_path)
        if line.strip()
    ]
    md_lines = [
        normalize_text(line)
        for line in extract_text_from_markdown(md_path)
        if line.strip()
    ]

    # autojunk=False: the default heuristic treats frequently repeated items
    # as junk once a sequence has 200+ entries, which can distort the match
    # for long documents with repeated lines.
    matcher = difflib.SequenceMatcher(None, docx_lines, md_lines, autojunk=False)
    ratio = matcher.ratio()

    # Collect every non-equal opcode as one entry per affected line.
    diffs = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            continue
        elif tag == "replace":
            # Pair lines positionally; the shorter side is padded with a
            # "(없음)" placeholder.
            for idx in range(max(i2 - i1, j2 - j1)):
                docx_text = docx_lines[i1 + idx] if i1 + idx < i2 else "(없음)"
                md_text = md_lines[j1 + idx] if j1 + idx < j2 else "(없음)"
                diffs.append({
                    "type": "변경",
                    "docx": docx_text[:80],
                    "markdown": md_text[:80],
                })
        elif tag == "delete":
            for idx in range(i1, i2):
                diffs.append({
                    "type": "DOCX에만 존재",
                    "docx": docx_lines[idx][:80],
                    "markdown": "-",
                })
        elif tag == "insert":
            for idx in range(j1, j2):
                diffs.append({
                    "type": "Markdown에만 존재",
                    "docx": "-",
                    "markdown": md_lines[idx][:80],
                })

    return {
        "status": "ok",
        "similarity": round(ratio * 100, 1),
        "docx_lines": len(docx_lines),
        "md_lines": len(md_lines),
        "diff_count": len(diffs),
        "diffs": diffs[:20],  # only the first 20 entries
    }


def main():
    """Check every pair in FILE_MAP and print a sync report.

    For each pair the similarity, line counts and up to ten differences are
    printed. A pair counts as a problem when the comparison reports an error
    or its similarity is below 80.

    Returns:
        0 when no pair had a problem, otherwise 1.
    """
    banner = "=" * 70
    divider = "─" * 50

    print(banner)
    print("DOCX ↔ Markdown 동기화 검증")
    print(banner)

    found_problem = False

    for docx_name, md_name in FILE_MAP.items():
        print(f"\n{divider}")
        print(f"문서: {docx_name}")
        print(f"  ↔ {md_name}")
        print(divider)

        report = compare_documents(docx_name, md_name)

        if report["status"] == "error":
            print(f"  [ERROR] {report['message']}")
            found_problem = True
            continue

        score = report["similarity"]
        if score >= 80:
            verdict = "OK"
        elif score >= 60:
            verdict = "WARN"
        else:
            verdict = "FAIL"

        print(f"  유사도: {score}% [{verdict}]")
        print(f"  DOCX 라인: {report['docx_lines']}")
        print(f"  Markdown 라인: {report['md_lines']}")
        print(f"  차이점: {report['diff_count']}개")

        # Show at most the first ten differences.
        shown = report["diffs"][:10]
        if shown:
            print(f"\n  주요 차이점 (상위 {len(shown)}개):")
            for entry in shown:
                print(f"    [{entry['type']}]")
                # "-" marks the side on which the line does not exist.
                if entry["docx"] != "-":
                    print(f"      DOCX: {entry['docx']}")
                if entry["markdown"] != "-":
                    print(f"      MD:   {entry['markdown']}")

        if score < 80:
            found_problem = True

    print(f"\n{banner}")
    if found_problem:
        print("결과: 일부 문서에서 불일치 발견 - 확인 필요")
    else:
        print("결과: 모든 문서 동기화 상태 양호")
    print(banner)

    return 1 if found_problem else 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)