sam-docs/sam/docs/contracts/scripts/sync_check.py

#!/usr/bin/env python3
"""
DOCX ↔ Markdown 동기화 검증 스크립트

DOCX에서 텍스트를 추출하고 Markdown 파일의 텍스트와 비교하여
불일치 항목을 리포트한다.
"""

import difflib
import re
import sys
from pathlib import Path

from docx import Document

# Parent of the directory that holds this script; the DOCX originals and the
# Markdown mirrors are looked up in its "docx" and "markdown" subdirectories.
BASE_DIR = Path(__file__).resolve().parent.parent
DOCX_DIR = BASE_DIR / "docx"
MD_DIR = BASE_DIR / "markdown"

# Mapping of DOCX file name (under DOCX_DIR) -> Markdown file name (under MD_DIR).
FILE_MAP = {
    "01_고객_서비스이용계약서_v4_0_전자서명용.docx": "01-service-agreement.md",
    "비밀유지서약서.docx": "02-nda.md",
    "영업파트너 위촉계약서.docx": "03-partner-agreement.md",
    "영업파트너 위촉계약서(단체용).docx": "04-partner-agreement-group.md",
}


def extract_text_from_docx(docx_path):
    """Extract the plain text of a DOCX file in document order.

    The direct children of the document body are walked once so that
    paragraphs and tables are emitted interleaved, in the order they appear.
    The revision-history section is excluded: any paragraph containing
    "개정이력" is dropped, and so is a table whose first row contains both
    "버전" and "날짜" cells, or the first table that directly follows a
    "개정이력" paragraph.

    Args:
        docx_path: Location of the DOCX file; converted with ``str()`` before
            being handed to ``Document``.

    Returns:
        list[str]: Stripped, non-empty paragraph texts, plus one
        ``"cell | cell | ..."`` string for every table row that has at least
        one non-empty cell.
    """
    doc = Document(str(docx_path))
    lines = []

    # Read the paragraph/table lists once instead of on every body child;
    # re-reading doc.paragraphs / doc.tables inside the loop made the walk
    # quadratic in document size.
    paragraphs = iter(doc.paragraphs)
    tables = iter(doc.tables)

    # True while we are right after a "개정이력" paragraph and still expect the
    # revision table that belongs to it.
    skip_revision = False

    for child in doc.element.body:
        # Drop the "{namespace}" prefix so only the local element name remains.
        tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag

        if tag == "p":
            paragraph = next(paragraphs, None)
            if paragraph is None:
                continue
            text = paragraph.text.strip()

            if "개정이력" in text:
                skip_revision = True
            elif text:
                # Any other visible paragraph ends the revision section.
                skip_revision = False
                lines.append(text)

        elif tag == "tbl":
            table = next(tables, None)
            if table is None:
                continue

            # Recognise the revision table by its header cells.
            is_revision_table = False
            if len(table.rows) > 0:
                first_row_text = [cell.text.strip() for cell in table.rows[0].cells]
                is_revision_table = "버전" in first_row_text and "날짜" in first_row_text

            if is_revision_table or skip_revision:
                skip_revision = False
                continue

            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                # Rows made up solely of empty cells carry no text.
                if any(cells):
                    lines.append(" | ".join(cells))

    return lines


def extract_text_from_markdown(md_path):
    """Extract the plain text of a Markdown file, one entry per content line.

    YAML front matter is skipped only when it opens the document (the first
    non-blank line is ``---``) and ends at the next ``---`` line. Any later
    ``---`` line is treated as a horizontal rule: it yields no text and does
    not hide the lines that follow it. Heading, list, bold/italic and
    block-quote markers are removed, table delimiter rows (including ones
    with ``:`` alignment markers) are dropped, and table rows are re-joined
    as ``"cell | cell"``.

    Args:
        md_path: ``pathlib.Path``-like object exposing ``read_text``.

    Returns:
        list[str]: Non-empty text lines with Markdown markup removed.
    """
    # "utf-8-sig" also decodes plain UTF-8 but drops a leading BOM, which
    # would otherwise stop the opening "---" from being recognised.
    content = md_path.read_text(encoding="utf-8-sig")
    lines = []

    in_frontmatter = False
    # True until the first non-blank line has been seen; front matter may
    # only start there.
    at_document_start = True

    for line in content.split("\n"):
        stripped = line.strip()

        # Blank lines never contribute text.
        if not stripped:
            continue

        if stripped == "---":
            if in_frontmatter:
                in_frontmatter = False
            elif at_document_start:
                in_frontmatter = True
            # Otherwise this is a horizontal rule in the body: no text.
            at_document_start = False
            continue
        at_document_start = False

        if in_frontmatter:
            continue

        text = stripped

        # Heading markers
        text = re.sub(r"^#{1,6}\s+", "", text)

        # Bullet list markers
        text = re.sub(r"^\s*[-*+]\s+", "", text)

        # Bold / italic markers
        text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)
        text = re.sub(r"\*(.+?)\*", r"\1", text)

        # Block-quote marker
        text = re.sub(r"^>\s*", "", text)

        # Table delimiter row, e.g. "|---|---|" or "| :--- | ---: |"
        if re.match(r"^\|[\s\-:|]+\|$", text):
            continue

        # Table row: drop the outer pipes and normalise the cell separator.
        if text.startswith("|") and text.endswith("|"):
            cells = [c.strip() for c in text.strip("|").split("|")]
            text = " | ".join(cells)

        text = text.strip()
        if text:
            lines.append(text)

    return lines


def normalize_text(text):
    """Canonicalise one extracted line so DOCX and Markdown text compare equal.

    Steps, in order: collapse whitespace runs to one space and trim, map
    non-breaking and ideographic spaces to a plain space, shrink runs of
    three or more underscores to exactly three, unwrap ``**bold**`` markers,
    and drop a leading ``- `` list marker.

    Args:
        text: A single line of extracted text.

    Returns:
        str: The normalised line.
    """
    # Whitespace runs -> single space, ends trimmed.
    collapsed = re.sub(r"\s+", " ", text).strip()

    # Special space characters -> plain space.
    for exotic_space in ("\u00a0", "\u3000"):  # non-breaking, ideographic
        collapsed = collapsed.replace(exotic_space, " ")

    # Fill-in blanks of any length (3+ underscores) become "___".
    blanks_unified = re.sub(r"_{3,}", "___", collapsed)

    # DOCX text sometimes carries literal ** markers; unwrap them.
    without_bold = re.sub(r"\*\*(.+?)\*\*", r"\1", blanks_unified)

    # DOCX text sometimes starts with a "- " list marker; remove it.
    return re.sub(r"^-\s+", "", without_bold)


def compare_documents(docx_name, md_name):
    """Compare the text of one DOCX file with its Markdown counterpart.

    Both files are reduced to lists of normalised text lines and matched
    line-by-line with ``difflib.SequenceMatcher``.

    Args:
        docx_name: File name looked up under ``DOCX_DIR``.
        md_name: File name looked up under ``MD_DIR``.

    Returns:
        dict: ``{"status": "error", "message": ...}`` when either file is
        missing. Otherwise ``status`` is ``"ok"`` and the dict carries
        ``similarity`` (percentage rounded to one decimal), ``docx_lines``
        and ``md_lines`` (line counts), ``diff_count`` (total number of
        differences) and ``diffs`` (at most the first 20 difference records,
        each with ``type``, ``docx`` and ``markdown`` texts cut to 80 chars).
    """
    docx_path = DOCX_DIR / docx_name
    md_path = MD_DIR / md_name

    if not docx_path.exists():
        return {"status": "error", "message": f"DOCX 파일 없음: {docx_name}"}
    if not md_path.exists():
        return {"status": "error", "message": f"Markdown 파일 없음: {md_name}"}

    docx_lines = [
        normalize_text(raw)
        for raw in extract_text_from_docx(docx_path)
        if raw.strip()
    ]
    md_lines = [
        normalize_text(raw)
        for raw in extract_text_from_markdown(md_path)
        if raw.strip()
    ]

    # Line-level comparison via difflib.
    matcher = difflib.SequenceMatcher(None, docx_lines, md_lines)
    ratio = matcher.ratio()

    # Turn every non-equal opcode into per-line difference records.
    diffs = []
    for op, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if op == "replace":
            left = docx_lines[a_lo:a_hi]
            right = md_lines[b_lo:b_hi]
            # Pad the shorter side so the lines can be paired one-to-one.
            width = max(len(left), len(right))
            left = left + ["(없음)"] * (width - len(left))
            right = right + ["(없음)"] * (width - len(right))
            diffs.extend(
                {
                    "type": "변경",
                    "docx": docx_text[:80],
                    "markdown": md_text[:80],
                }
                for docx_text, md_text in zip(left, right)
            )
        elif op == "delete":
            diffs.extend(
                {
                    "type": "DOCX에만 존재",
                    "docx": docx_text[:80],
                    "markdown": "-",
                }
                for docx_text in docx_lines[a_lo:a_hi]
            )
        elif op == "insert":
            diffs.extend(
                {
                    "type": "Markdown에만 존재",
                    "docx": "-",
                    "markdown": md_text[:80],
                }
                for md_text in md_lines[b_lo:b_hi]
            )

    return {
        "status": "ok",
        "similarity": round(ratio * 100, 1),
        "docx_lines": len(docx_lines),
        "md_lines": len(md_lines),
        "diff_count": len(diffs),
        "diffs": diffs[:20],  # first 20 only
    }


def main():
    """Check every pair in ``FILE_MAP`` and print a report to stdout.

    For each pair the similarity, line counts and difference count are
    printed, followed by up to ten difference records. A pair counts as
    failed when it could not be compared or scored below 80% similarity.

    Returns:
        int: 0 when no pair failed, otherwise 1.
    """
    print("=" * 70)
    print("DOCX ↔ Markdown 동기화 검증")
    print("=" * 70)

    everything_synced = True

    for docx_name, md_name in FILE_MAP.items():
        print(f"\n{'─' * 50}")
        print(f"문서: {docx_name}")
        print(f"  ↔ {md_name}")
        print(f"{'─' * 50}")

        result = compare_documents(docx_name, md_name)

        if result["status"] == "error":
            print(f"  [ERROR] {result['message']}")
            everything_synced = False
            continue

        similarity = result["similarity"]
        if similarity >= 80:
            status_icon = "OK"
        elif similarity >= 60:
            status_icon = "WARN"
        else:
            status_icon = "FAIL"

        print(f"  유사도: {similarity}% [{status_icon}]")
        print(f"  DOCX 라인: {result['docx_lines']}")
        print(f"  Markdown 라인: {result['md_lines']}")
        print(f"  차이점: {result['diff_count']}개")

        if result["diffs"]:
            print(f"\n  주요 차이점 (상위 {min(len(result['diffs']), 10)}개):")
            for diff in result["diffs"][:10]:
                print(f"    [{diff['type']}]")
                # "-" marks the side on which the line does not exist.
                if diff["docx"] != "-":
                    print(f"      DOCX: {diff['docx']}")
                if diff["markdown"] != "-":
                    print(f"      MD:   {diff['markdown']}")

        # Anything other than "OK" means the score is below 80.
        if status_icon != "OK":
            everything_synced = False

    print(f"\n{'=' * 70}")
    if everything_synced:
        print("결과: 모든 문서 동기화 상태 양호")
    else:
        print("결과: 일부 문서에서 불일치 발견 - 확인 필요")
    print(f"{'=' * 70}")

    if everything_synced:
        return 0
    return 1


# When run as a script, use main()'s return value (0 or 1) as the exit status.
if __name__ == "__main__":
    sys.exit(main())
feat: [contracts] 계약서 버전 관리 시스템 구축 - DOCX 4종 → Markdown 미러링 체계 구축 (Git diff 추적) - DOCX에 개정이력 테이블 삽입 (Pretendard 9pt, 파란 헤더) - 자동화 스크립트 3종 (추출/삽입/동기화 검증) - revisions.json, CHANGELOG.md, INDEX.md 업데이트 - .gitignore에 contracts 경로 allowlist 추가 2026-02-22 17:42:31 +09:00			`#!/usr/bin/env python3`
			`"""`
			`DOCX ↔ Markdown 동기화 검증 스크립트`

			`DOCX에서 텍스트를 추출하고 Markdown 파일의 텍스트와 비교하여`
			`불일치 항목을 리포트한다.`
			`"""`

			`import difflib`
			`import re`
			`import sys`
			`from pathlib import Path`

			`from docx import Document`

			`BASE_DIR = Path(__file__).resolve().parent.parent`
			`DOCX_DIR = BASE_DIR / "docx"`
			`MD_DIR = BASE_DIR / "markdown"`

			`# DOCX → Markdown 파일 매핑`
			`FILE_MAP = {`
			`"01_고객_서비스이용계약서_v4_0_전자서명용.docx": "01-service-agreement.md",`
			`"비밀유지서약서.docx": "02-nda.md",`
			`"영업파트너 위촉계약서.docx": "03-partner-agreement.md",`
			`"영업파트너 위촉계약서(단체용).docx": "04-partner-agreement-group.md",`
			`}`


			`def extract_text_from_docx(docx_path):`
			`"""DOCX에서 순수 텍스트만 추출 (개정이력 테이블 제외, 인터리빙 방식)"""`
			`doc = Document(str(docx_path))`
			`lines = []`

			`from docx.oxml.ns import qn as _qn`

			`body = doc.element.body`
			`para_idx = 0`
			`table_idx = 0`
			`skip_revision = False`

			`for child in body:`
			`tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag`

			`if tag == "p":`
			`if para_idx < len(doc.paragraphs):`
			`text = doc.paragraphs[para_idx].text.strip()`
			`para_idx += 1`

			`if "개정이력" in text:`
			`skip_revision = True`
			`continue`
			`if text:`
			`skip_revision = False`
			`lines.append(text)`

			`elif tag == "tbl":`
			`if table_idx < len(doc.tables):`
			`table = doc.tables[table_idx]`
			`table_idx += 1`

			`# 개정이력 테이블 건너뛰기`
			`if len(table.rows) > 0:`
			`first_row_text = [cell.text.strip() for cell in table.rows[0].cells]`
			`if "버전" in first_row_text and "날짜" in first_row_text:`
			`skip_revision = False`
			`continue`

			`if skip_revision:`
			`skip_revision = False`
			`continue`

			`for row in table.rows:`
fix: [contracts] Markdown ↔ DOCX 동기화 100% 달성 - 분할 문단 원복 (비밀유지서약서, 영업파트너 위촉계약서 2종) - 제목 꺾쇠(< >) 복원 (영업파트너 위촉계약서 2종) - 회사 이메일 누락 복원 (영업파트너 위촉계약서 2종) - sync_check 정규화 개선 (Bold 마커, 리스트 접두사, 빈 테이블 행) 2026-02-22 17:52:57 +09:00			`cells = [cell.text.strip() for cell in row.cells]`
			`# 빈 셀만 있는 행 건너뛰기`
			`if not any(cells):`
			`continue`
			`row_text = " | ".join(cells)`
feat: [contracts] 계약서 버전 관리 시스템 구축 - DOCX 4종 → Markdown 미러링 체계 구축 (Git diff 추적) - DOCX에 개정이력 테이블 삽입 (Pretendard 9pt, 파란 헤더) - 자동화 스크립트 3종 (추출/삽입/동기화 검증) - revisions.json, CHANGELOG.md, INDEX.md 업데이트 - .gitignore에 contracts 경로 allowlist 추가 2026-02-22 17:42:31 +09:00			`if row_text.strip():`
			`lines.append(row_text)`

			`return lines`


			`def extract_text_from_markdown(md_path):`
			`"""Markdown에서 순수 텍스트만 추출 (프론트매터, 마크업 제거)"""`
			`content = md_path.read_text(encoding="utf-8")`
			`lines = []`

			`in_frontmatter = False`
			`in_table = False`

			`for line in content.split("\n"):`
			`stripped = line.strip()`

			`# YAML 프론트매터 건너뛰기`
			`if stripped == "---":`
			`in_frontmatter = not in_frontmatter`
			`continue`
			`if in_frontmatter:`
			`continue`

			`# 빈 줄 건너뛰기`
			`if not stripped:`
			`in_table = False`
			`continue`

			`# Markdown 마크업 제거`
			`text = stripped`

			`# 헤딩 마크업 제거`
			`text = re.sub(r"^#{1,6}\s+", "", text)`

			`# 리스트 마크업 제거`
			`text = re.sub(r"^\s*[-*+]\s+", "", text)`

			`# Bold/Italic 마크업 제거`
			`text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)`
			`text = re.sub(r"\*(.+?)\*", r"\1", text)`

			`# 블록인용 제거`
			`text = re.sub(r"^>\s*", "", text)`

			`# 테이블 구분선 건너뛰기`
			`if re.match(r"^\|[\s\-|]+\|$", text):`
			`continue`

			`# 테이블 행`
			`if text.startswith("|") and text.endswith("|"):`
			`# 파이프 제거하고 셀 텍스트 추출`
			`cells = [c.strip() for c in text.strip("|").split("|")]`
			`text = " | ".join(cells)`

			`text = text.strip()`
			`if text:`
			`lines.append(text)`

			`return lines`


			`def normalize_text(text):`
			`"""비교를 위한 텍스트 정규화"""`
			`# 공백 정규화`
			`text = re.sub(r"\s+", " ", text).strip()`
			`# 특수문자 정규화`
			`text = text.replace("\u00a0", " ") # non-breaking space`
			`text = text.replace("\u3000", " ") # ideographic space`
			`# 언더스코어 빈칸 정규화`
			`text = re.sub(r"_{3,}", "___", text)`
fix: [contracts] Markdown ↔ DOCX 동기화 100% 달성 - 분할 문단 원복 (비밀유지서약서, 영업파트너 위촉계약서 2종) - 제목 꺾쇠(< >) 복원 (영업파트너 위촉계약서 2종) - 회사 이메일 누락 복원 (영업파트너 위촉계약서 2종) - sync_check 정규화 개선 (Bold 마커, 리스트 접두사, 빈 테이블 행) 2026-02-22 17:52:57 +09:00			`# Bold 마크업(**) 제거 (DOCX 텍스트에 리터럴 ** 포함되는 경우)`
			`text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)`
			`# 선행 리스트 마커 제거 (DOCX 텍스트가 "- "로 시작하는 경우)`
			`text = re.sub(r"^-\s+", "", text)`
feat: [contracts] 계약서 버전 관리 시스템 구축 - DOCX 4종 → Markdown 미러링 체계 구축 (Git diff 추적) - DOCX에 개정이력 테이블 삽입 (Pretendard 9pt, 파란 헤더) - 자동화 스크립트 3종 (추출/삽입/동기화 검증) - revisions.json, CHANGELOG.md, INDEX.md 업데이트 - .gitignore에 contracts 경로 allowlist 추가 2026-02-22 17:42:31 +09:00			`return text`


			`def compare_documents(docx_name, md_name):`
			`"""두 문서의 텍스트를 비교"""`
			`docx_path = DOCX_DIR / docx_name`
			`md_path = MD_DIR / md_name`

			`if not docx_path.exists():`
			`return {"status": "error", "message": f"DOCX 파일 없음: {docx_name}"}`
			`if not md_path.exists():`
			`return {"status": "error", "message": f"Markdown 파일 없음: {md_name}"}`

			`docx_lines = [normalize_text(l) for l in extract_text_from_docx(docx_path) if l.strip()]`
			`md_lines = [normalize_text(l) for l in extract_text_from_markdown(md_path) if l.strip()]`

			`# difflib로 비교`
			`matcher = difflib.SequenceMatcher(None, docx_lines, md_lines)`
			`ratio = matcher.ratio()`

			`# 차이점 추출`
			`diffs = []`
			`for tag, i1, i2, j1, j2 in matcher.get_opcodes():`
			`if tag == "equal":`
			`continue`
			`elif tag == "replace":`
			`for idx in range(max(i2 - i1, j2 - j1)):`
			`docx_text = docx_lines[i1 + idx] if i1 + idx < i2 else "(없음)"`
			`md_text = md_lines[j1 + idx] if j1 + idx < j2 else "(없음)"`
			`diffs.append({`
			`"type": "변경",`
			`"docx": docx_text[:80],`
			`"markdown": md_text[:80],`
			`})`
			`elif tag == "delete":`
			`for idx in range(i1, i2):`
			`diffs.append({`
			`"type": "DOCX에만 존재",`
			`"docx": docx_lines[idx][:80],`
			`"markdown": "-",`
			`})`
			`elif tag == "insert":`
			`for idx in range(j1, j2):`
			`diffs.append({`
			`"type": "Markdown에만 존재",`
			`"docx": "-",`
			`"markdown": md_lines[idx][:80],`
			`})`

			`return {`
			`"status": "ok",`
			`"similarity": round(ratio * 100, 1),`
			`"docx_lines": len(docx_lines),`
			`"md_lines": len(md_lines),`
			`"diff_count": len(diffs),`
			`"diffs": diffs[:20], # 상위 20개만`
			`}`


			`def main():`
			`print("=" * 70)`
			`print("DOCX ↔ Markdown 동기화 검증")`
			`print("=" * 70)`

			`all_ok = True`

			`for docx_name, md_name in FILE_MAP.items():`
			`print(f"\n{'─' * 50}")`
			`print(f"문서: {docx_name}")`
			`print(f" ↔ {md_name}")`
			`print(f"{'─' * 50}")`

			`result = compare_documents(docx_name, md_name)`

			`if result["status"] == "error":`
			`print(f" [ERROR] {result['message']}")`
			`all_ok = False`
			`continue`

			`similarity = result["similarity"]`
			`status_icon = "OK" if similarity >= 80 else "WARN" if similarity >= 60 else "FAIL"`

			`print(f" 유사도: {similarity}% [{status_icon}]")`
			`print(f" DOCX 라인: {result['docx_lines']}")`
			`print(f" Markdown 라인: {result['md_lines']}")`
			`print(f" 차이점: {result['diff_count']}개")`

			`if result["diffs"]:`
			`print(f"\n 주요 차이점 (상위 {min(len(result['diffs']), 10)}개):")`
			`for i, diff in enumerate(result["diffs"][:10]):`
			`print(f" [{diff['type']}]")`
			`if diff["docx"] != "-":`
			`print(f" DOCX: {diff['docx']}")`
			`if diff["markdown"] != "-":`
			`print(f" MD: {diff['markdown']}")`

			`if similarity < 80:`
			`all_ok = False`

			`print(f"\n{'=' * 70}")`
			`if all_ok:`
			`print("결과: 모든 문서 동기화 상태 양호")`
			`else:`
			`print("결과: 일부 문서에서 불일치 발견 - 확인 필요")`
			`print(f"{'=' * 70}")`

			`return 0 if all_ok else 1`


			`if __name__ == "__main__":`
			`sys.exit(main())`