med-notes/data_structure/docx_to_md.py

#!/usr/bin/env python3
"""
Convert a directory of DOCX documents to Markdown format with proper formatting.
"""

import os
import re
import argparse
from pathlib import Path
from typing import Optional, List

# Probe each optional conversion backend independently so that a missing
# package only disables the converter that actually needs it.
MAMMOTH_AVAILABLE = False
PANDOC_AVAILABLE = False
DOCX_AVAILABLE = False

try:
    import mammoth

    MAMMOTH_AVAILABLE = True
    print("✓ mammoth library available")
except ImportError:
    print("✗ mammoth not available")

try:
    import pypandoc

    PANDOC_AVAILABLE = True
    print("✓ pypandoc library available")
except ImportError:
    print("✗ pypandoc not available")

try:
    from docx import Document

    DOCX_AVAILABLE = True
    print("✓ python-docx library available")
except ImportError:
    print("✗ python-docx not available")

# markdownify is not called by any converter in this file, so it is imported
# on a best-effort basis: its absence must not disable the python-docx
# fallback, which only needs `Document`.
try:
    from markdownify import markdownify as md
except ImportError:
    md = None

# Show available methods, in the same priority order used for fallback.
available_methods = []
if MAMMOTH_AVAILABLE:
    available_methods.append("mammoth")
if PANDOC_AVAILABLE:
    available_methods.append("pandoc")
if DOCX_AVAILABLE:
    available_methods.append("python-docx")

if available_methods:
    print(f"Available conversion methods: {', '.join(available_methods)}")
else:
    print("No conversion libraries available!")
    print("Install with: pip install mammoth pypandoc python-docx markdownify")


# Spans whose text must never be rewritten by the prose rules below.
_PROTECTED_SPAN_RE = re.compile(
    r"`[^`\n]*`"  # inline code span
    r"|\]\([^)\n]*\)"  # link or image destination
    r"|<[A-Za-z/!][^<>\n]*>"  # autolink or inline HTML tag
    r"|\b[A-Za-z][A-Za-z0-9+.-]*://\S+"  # bare URL
)
# Placeholder written in place of a protected span: NUL + index + NUL.
_MASK_TOKEN_RE = re.compile(r"\x00(\d+)\x00")

_MULTI_SPACE_RE = re.compile(r" {2,}")

# "-item" / "+item" -> "- item". Skips rules ("---"), arrows and numbers ("-5").
_DASH_BULLET_RE = re.compile(r"^[-+](?=[^\s\-+*>\d])")
# "*item" -> "- item", only when no other asterisk follows on the line, so
# "*italic*" and "**bold**" at the start of a line are left alone.
_STAR_BULLET_RE = re.compile(r"^\*(?=[^\s\-+*>\d][^*]*$)")
# "1.Item" -> "1. Item". A following digit means a decimal such as "0.5".
_ORDERED_ITEM_RE = re.compile(r"^(\d+\.)(?=[^\s\d])")
# A well-formed list marker; it is excluded from the prose rules so that a
# "* " bullet can never be paired up as an emphasis delimiter.
_LIST_PREFIX_RE = re.compile(r"^(?:[-*+]|\d+[.)])[ \t]+")

# "end.Next" -> "end. Next". Requires a lowercase letter or closing mark
# before the punctuation (optionally a backslash-escaped one) so
# abbreviations such as "U.S.A" and names such as "report.PDF" survive.
_SENTENCE_GAP_RE = re.compile(
    r"(?:(?<=[a-z)\]\"'])|(?<=[a-z)\]\"']\\))"
    r"([.!?])(?=[A-Z](?:[a-z\s']|$))"
)
_COMMA_GAP_RE = re.compile(r",(?=[a-zA-Z])")
_COLON_GAP_RE = re.compile(r"([;:])(?=[a-zA-Z])")
# "word ," -> "word,". Only when the punctuation really ends a word, so
# " ![image]", " .NET" and table alignment cells (" :---") are untouched.
_SPACE_BEFORE_PUNCT_RE = re.compile(r" +([,.;:!?]+)(?=[\s*_]|$)")

# A same-line pair of "*" or "**" delimiters with optional inner padding.
_EMPHASIS_RE = re.compile(
    r"(?<![\\*])(?P<marker>\*\*|\*)(?!\*)"
    r"(?P<lead>[ \t]*)(?P<inner>[^\s*](?:[^*\n]*?[^\s*])?)(?P<trail>[ \t]*)"
    r"(?<!\\)(?P=marker)(?!\*)"
)
_OPENING_PUNCT = "([{"
_CLOSING_PUNCT = ".,;:!?)]}"


def _tighten_emphasis(match):
    """Move padding from inside an emphasis pair to the outside."""
    marker = match.group("marker")
    lead, trail = match.group("lead"), match.group("trail")
    if not (lead or trail):
        return match.group(0)
    if marker == "*" and lead and trail:
        # "2 * 3 * 4": asterisks spaced on both sides are literal text.
        return match.group(0)

    source = match.string
    before = source[match.start() - 1 : match.start()]
    after = source[match.end() : match.end() + 1]
    # Re-insert the removed padding outside the markers, but only where a
    # word would otherwise be glued to the emphasis.
    pad_left = (
        " "
        if lead and before and not before.isspace() and before not in _OPENING_PUNCT
        else ""
    )
    pad_right = (
        " "
        if trail and after and not after.isspace() and after not in _CLOSING_PUNCT
        else ""
    )
    return f"{pad_left}{marker}{match.group('inner')}{marker}{pad_right}"


def _clean_prose_line(line: str) -> str:
    """Apply the spacing, list and punctuation rules to one non-code line."""
    hidden = []

    def _hide(match):
        hidden.append(match.group(0))
        return f"\x00{len(hidden) - 1}\x00"

    # Mask protected spans so no rule can see or alter their contents. A line
    # that already contains NUL is processed unmasked to keep tokens unambiguous.
    if "\x00" not in line:
        line = _PROTECTED_SPAN_RE.sub(_hide, line)

    # Keep the leading whitespace verbatim (tabs included).
    body = line.lstrip()
    indent = line[: len(line) - len(body)]

    body = _MULTI_SPACE_RE.sub(" ", body)
    body = _DASH_BULLET_RE.sub("- ", body)
    body = _STAR_BULLET_RE.sub("- ", body)
    body = _ORDERED_ITEM_RE.sub(r"\1 ", body)

    marker = _LIST_PREFIX_RE.match(body)
    prefix_len = marker.end() if marker else 0
    prefix, prose = body[:prefix_len], body[prefix_len:]

    prose = _SENTENCE_GAP_RE.sub(r"\1 ", prose)
    prose = _COMMA_GAP_RE.sub(", ", prose)
    prose = _COLON_GAP_RE.sub(r"\1 ", prose)
    prose = _SPACE_BEFORE_PUNCT_RE.sub(r"\1", prose)
    prose = _EMPHASIS_RE.sub(_tighten_emphasis, prose)

    cleaned = indent + prefix + prose
    if hidden:
        cleaned = _MASK_TOKEN_RE.sub(lambda m: hidden[int(m.group(1))], cleaned)
    return cleaned


def clean_markdown_text(text: str) -> str:
    """
    Normalize whitespace, list markers and punctuation spacing in Markdown.

    The text is processed line by line. Lines inside ``` fenced code blocks
    are copied through untouched; on every other line inline code spans, link
    destinations, HTML tags/autolinks and bare URLs are protected as well.

    Rules applied to the remaining prose:
    - runs of spaces collapse to one (leading indentation is kept verbatim)
    - "-item", "+item", "*item" become "- item"; "1.Item" becomes "1. Item"
      (rules like "---", emphasis like "**bold**" and decimals like "0.5"
      are not mistaken for list markers)
    - a space is inserted after sentence punctuation, commas, colons and
      semicolons that are glued to the next word
    - spaces before punctuation are removed
    - padding inside "*...*" / "**...**" pairs is moved outside the markers
      ("**Label: **text" -> "**Label:** text")
    - runs of empty lines collapse to a single empty line

    Args:
        text: Markdown text, with any line-ending convention.

    Returns:
        The cleaned text with "\\n" line endings and no leading or trailing
        whitespace.
    """
    # Normalize line endings first so the line split below is reliable.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    cleaned_lines = []
    in_code_block = False

    for line in text.split("\n"):
        # A fence line toggles code mode and is itself kept verbatim.
        if line.strip().startswith("```"):
            in_code_block = not in_code_block
            cleaned_lines.append(line)
            continue

        if in_code_block:
            cleaned_lines.append(line)
            continue

        # Collapse consecutive empty lines (outside code) into one.
        if not line and cleaned_lines and not cleaned_lines[-1]:
            continue

        cleaned_lines.append(_clean_prose_line(line))

    return "\n".join(cleaned_lines).strip()


def convert_docx_with_mammoth(docx_path: Path) -> str:
    """
    Render a DOCX file as Markdown through mammoth.

    Any messages reported by mammoth are printed as warnings. If anything
    fails, the error is printed and the same exception is re-raised for the
    caller to handle.

    Args:
        docx_path: Location of the DOCX file to read.

    Returns:
        The Markdown text produced by mammoth.
    """
    try:
        with open(docx_path, "rb") as stream:
            outcome = mammoth.convert_to_markdown(stream)
            if outcome.messages:
                print(f"Mammoth warnings: {outcome.messages}")
            return outcome.value
    except Exception as exc:
        # Report here for context, then let the caller decide what to do.
        print(f"Mammoth conversion error: {exc}")
        raise


def convert_docx_with_pandoc(docx_path: Path) -> str:
    """
    Render a DOCX file as Markdown through pypandoc.

    Args:
        docx_path: Location of the DOCX file; handed to pypandoc as a string.

    Returns:
        The Markdown text returned by `pypandoc.convert_file`.
    """
    source = str(docx_path)
    markdown = pypandoc.convert_file(source, "md", format="docx")
    return markdown


def convert_docx_with_python_docx(docx_path: Path) -> str:
    """
    Convert DOCX to Markdown using python-docx (basic fallback).

    Top-level paragraphs and tables are emitted in document order. Paragraphs
    whose style name starts with "Heading" become ATX headings; every other
    non-empty paragraph becomes plain text. Tables become pipe tables whose
    first row is treated as the header. Inline formatting is not preserved.

    Args:
        docx_path: Path to the DOCX file.

    Returns:
        The Markdown text, one block per paragraph/table separated by blank
        lines.
    """
    # Local imports: only this fallback needs the block-level proxy classes,
    # and python-docx is an optional dependency of the module.
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    # Passed as str, matching how the pandoc converter hands over its path.
    doc = Document(str(docx_path))

    content = []
    # Walk the body's children directly: `doc.paragraphs` and `doc.tables`
    # are separate lists, so using them would move every table to the end.
    for child in doc.element.body.iterchildren():
        if isinstance(child, CT_P):
            paragraph = Paragraph(child, doc)
            text = paragraph.text.strip()
            if not text:
                continue

            # Guard against a missing style or an unnamed style.
            style_name = getattr(paragraph.style, "name", None) or ""
            if style_name.startswith("Heading"):
                suffix = style_name.split()[-1]
                level = int(suffix) if suffix.isdigit() else 1
                # Markdown only defines heading levels 1-6.
                level = max(1, min(level, 6))
                content.append("#" * level + " " + text)
            else:
                content.append(text)
            content.append("")  # Blank line after each paragraph

        elif isinstance(child, CT_Tbl):
            table = Table(child, doc)
            content.append("")  # Blank line before table
            for i, row in enumerate(table.rows):
                # A raw "|" or line break inside a cell would split the row.
                row_data = [
                    cell.text.strip().replace("|", "\\|").replace("\n", "<br>")
                    for cell in row.cells
                ]
                content.append("| " + " | ".join(row_data) + " |")
                if i == 0:  # Separator after the header row
                    content.append("|" + "---|" * len(row_data))
            content.append("")  # Blank line after table

    return "\n".join(content)


def convert_single_docx(
    input_path: Path, output_dir: Path, method: str = "mammoth"
) -> bool:
    """
    Convert one DOCX file and write the result as `<stem>.md`.

    The requested backend is used when its library is importable. Otherwise
    the first available backend is used, tried in the order mammoth, pandoc,
    python-docx. Any exception raised along the way is printed and reported
    through the return value rather than propagated.

    Args:
        input_path: Path to the DOCX file
        output_dir: Directory to save the Markdown file
        method: Conversion method ('mammoth', 'pandoc', or 'python-docx')

    Returns:
        True if the Markdown file was written, False otherwise
    """
    try:
        print(f"Converting: {input_path.name}")

        # Backends in fallback-priority order: (name, importable?, converter).
        backends = (
            ("mammoth", MAMMOTH_AVAILABLE, convert_docx_with_mammoth),
            ("pandoc", PANDOC_AVAILABLE, convert_docx_with_pandoc),
            ("python-docx", DOCX_AVAILABLE, convert_docx_with_python_docx),
        )
        usable = [(name, convert) for name, ready, convert in backends if ready]

        converter = next(
            (convert for name, convert in usable if name == method), None
        )
        if converter is None:
            print(f"Method '{method}' not available. Trying alternatives...")
            if not usable:
                print("No conversion libraries available!")
                return False
            converter = usable[0][1]

        raw_markdown = converter(input_path)

        # Tidy the converter output before anything is written to disk.
        final_markdown = clean_markdown_text(raw_markdown)

        destination = output_dir / f"{input_path.stem}.md"
        with open(destination, "w", encoding="utf-8") as handle:
            handle.write(final_markdown)

        print(f"✓ Saved: {destination}")
        return True

    except Exception as exc:
        print(f"✗ Error converting {input_path.name}: {exc}")
        return False


def convert_directory(
    input_dir: str, output_dir: str, method: str = "mammoth", recursive: bool = False
):
    """
    Convert all DOCX files in a directory to Markdown.

    Files are matched by a case-insensitive ".docx" extension, Word lock
    files (names starting with "~$") are skipped, and the remaining files are
    processed in sorted path order. A summary of successes and failures is
    printed at the end. Nothing is returned; problems are reported on stdout.

    Args:
        input_dir: Directory containing DOCX files
        output_dir: Directory to save Markdown files (created if missing)
        method: Conversion method to use
        recursive: Whether to search subdirectories
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # Validate the input before touching the filesystem.
    if not input_path.exists():
        print(f"Error: Input directory '{input_dir}' does not exist.")
        return
    if not input_path.is_dir():
        print(f"Error: Input path '{input_dir}' is not a directory.")
        return

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    # Compare the suffix case-insensitively so "NOTES.DOCX" is found too, and
    # sort so runs are reproducible regardless of directory listing order.
    candidates = input_path.rglob("*") if recursive else input_path.iterdir()
    docx_files = sorted(
        path
        for path in candidates
        if path.is_file()
        and path.suffix.lower() == ".docx"
        and not path.name.startswith("~$")  # Word's temporary lock files
    )

    if not docx_files:
        print("No DOCX files found in the specified directory.")
        return

    print(f"Found {len(docx_files)} DOCX files to convert...")

    # Convert each file
    successful = 0
    failed = 0

    for docx_file in docx_files:
        if convert_single_docx(docx_file, output_path, method):
            successful += 1
        else:
            failed += 1

    print("\nConversion complete:")
    print(f"✓ Successful: {successful}")
    print(f"✗ Failed: {failed}")


def main():
    """Parse the command line, echo the chosen settings, and run the batch conversion."""
    # Kept as an explicit string so the help layout is exactly what is shown.
    usage_examples = (
        "\nExamples:\n"
        "  python docx_to_md.py /path/to/docx/files /path/to/output\n"
        "  python docx_to_md.py ./documents ./markdown --method pandoc --recursive\n"
        "  python docx_to_md.py ~/docs ~/markdown -r\n"
        "        "
    )
    cli = argparse.ArgumentParser(
        description="Convert DOCX documents to Markdown format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Positional arguments: where to read from and where to write to.
    cli.add_argument("input_dir", help="Directory containing DOCX files")
    cli.add_argument("output_dir", help="Directory to save Markdown files")

    # Optional switches.
    cli.add_argument(
        "--method",
        "-m",
        choices=["mammoth", "pandoc", "python-docx"],
        default="mammoth",
        help="Conversion method (default: mammoth)",
    )
    cli.add_argument(
        "--recursive",
        "-r",
        action="store_true",
        help="Search subdirectories recursively",
    )

    options = cli.parse_args()

    # Banner followed by one "label: value" line per setting.
    print("DOCX to Markdown Converter")
    print("=" * 30)
    settings = (
        ("Input directory", options.input_dir),
        ("Output directory", options.output_dir),
        ("Method", options.method),
        ("Recursive", options.recursive),
    )
    for label, value in settings:
        print(f"{label}: {value}")
    print()

    convert_directory(
        options.input_dir, options.output_dir, options.method, options.recursive
    )


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()