update

2025-06-29 13:04:14 +00:00
parent ad7bd9d5d6
commit 5c5e0bfe1a
56 changed files with 1749 additions and 0 deletions
--- a/data_structure/docx_to_md.py
+++ b/data_structure/docx_to_md.py
@@ -0,0 +1,333 @@
+#!/usr/bin/env python3
+"""
+Convert a directory of DOCX documents to Markdown format with proper formatting.
+"""
+
+import os
+import re
+import argparse
+from pathlib import Path
+from typing import Optional, List
+
+# Probe each optional conversion backend independently. A missing library
+# only disables its own method; the script still imports and runs with
+# whatever is installed.
+MAMMOTH_AVAILABLE = False
+PANDOC_AVAILABLE = False
+DOCX_AVAILABLE = False
+
+try:
+    import mammoth
+
+    MAMMOTH_AVAILABLE = True
+    print("✓ mammoth library available")
+except ImportError:
+    print("✗ mammoth not available")
+
+try:
+    import pypandoc
+
+    PANDOC_AVAILABLE = True
+    print("✓ pypandoc library available")
+except ImportError:
+    print("✗ pypandoc not available")
+
+try:
+    from docx import Document
+
+    DOCX_AVAILABLE = True
+    print("✓ python-docx library available")
+except ImportError:
+    print("✗ python-docx not available")
+
+# markdownify is imported separately: the python-docx converter in this file
+# only needs `Document`, so a missing markdownify must not mark the
+# python-docx method as unavailable.
+try:
+    from markdownify import markdownify as md
+except ImportError:
+    md = None
+
+# Show available methods, in the same order used for fallback selection.
+available_methods = [
+    name
+    for name, is_available in (
+        ("mammoth", MAMMOTH_AVAILABLE),
+        ("pandoc", PANDOC_AVAILABLE),
+        ("python-docx", DOCX_AVAILABLE),
+    )
+    if is_available
+]
+
+if available_methods:
+    print(f"Available conversion methods: {', '.join(available_methods)}")
+else:
+    print("No conversion libraries available!")
+    print("Install with: pip install mammoth pypandoc python-docx markdownify")
+
+
+# Spans whose contents must never be rewritten: inline code, link/image
+# targets, angle-bracket autolinks or HTML tags, and bare URLs.
+_PROTECTED_SPAN = re.compile(r"(`[^`]*`|\]\([^)]*\)|<[^>\s]+>|https?://\S+)")
+_MULTI_SPACE = re.compile(r" {2,}")
+# Sentence punctuation glued to a following capital ("end.Next"). The
+# look-behind skips abbreviations ("U.S.A") and dot-names (" .NET").
+_SENTENCE_GAP = re.compile(r"(?<=[^A-Z\s])([.!?])(?=[A-Z])")
+_COMMA_GAP = re.compile(r",(?=[a-zA-Z])")
+# A letter is required on both sides, so "std::string" is left alone.
+_COLON_GAP = re.compile(r"(?<=[a-zA-Z])([;:])(?=[a-zA-Z])")
+# Only punctuation that closes a word group is pulled left, so " .gitignore"
+# and " ![image]" keep their leading space.
+_SPACE_BEFORE_PUNCT = re.compile(r" +([,.;:!?]+)(?=\s|$)")
+# Emphasis pairs; backslash-escaped asterisks are literal text, not markers.
+_BOLD_SPAN = re.compile(r"(?<!\\)(\*\*)(.+?)(?<!\\)\*\*")
+_ITALIC_SPAN = re.compile(r"(?<![*\\])(\*)(?!\*)(.+?)(?<![*\\])\*(?!\*)")
+_SPACED_BULLET = re.compile(r"([-*+])[ \t]+")
+# A bullet with no space after it. Repeated markers ("**bold", "---") and
+# signed numbers ("-5", "+1") are not bullets.
+_TIGHT_BULLET = re.compile(r"[-*+](?=[^\s\d*+-])")
+# "1.Item" / "1. Item", but not decimals such as "3.14".
+_ORDERED_ITEM = re.compile(r"(\d+\.)[ \t]*(?=[^\s\d])")
+
+
+def _trim_emphasis(match: "re.Match[str]") -> str:
+    """Strip whitespace just inside an emphasis pair; keep blank pairs as-is."""
+    marker, inner = match.group(1), match.group(2).strip()
+    return f"{marker}{inner}{marker}" if inner else match.group(0)
+
+
+def _clean_prose(segment: str) -> str:
+    """Apply spacing and punctuation fixes to text outside protected spans."""
+    segment = _MULTI_SPACE.sub(" ", segment)
+    segment = _SENTENCE_GAP.sub(r"\1 ", segment)
+    segment = _COMMA_GAP.sub(", ", segment)
+    segment = _COLON_GAP.sub(r"\1 ", segment)
+    # Must run after the gap fixes so "a ,b" becomes "a, b" rather than "a , b".
+    segment = _SPACE_BEFORE_PUNCT.sub(r"\1", segment)
+    segment = _BOLD_SPAN.sub(_trim_emphasis, segment)
+    return _ITALIC_SPAN.sub(_trim_emphasis, segment)
+
+
+def _clean_markdown_line(line: str) -> str:
+    """Clean one non-code line, preserving its indentation and list marker."""
+    content = line.lstrip()
+    if not content:
+        return line
+    indent = line[: len(line) - len(content)]
+
+    # Peel the list marker off first so it is never mistaken for emphasis.
+    prefix = ""
+    spaced = _SPACED_BULLET.match(content)
+    if spaced:
+        prefix, content = f"{spaced.group(1)} ", content[spaced.end():]
+    elif _TIGHT_BULLET.match(content) and not (
+        content[0] == "*" and "*" in content[1:]
+    ):
+        # A leading "*" with a partner later on the line is emphasis
+        # ("*word* ..."), so only unpaired markers are treated as bullets.
+        prefix, content = "- ", content[1:]
+    else:
+        ordered = _ORDERED_ITEM.match(content)
+        if ordered:
+            prefix, content = f"{ordered.group(1)} ", content[ordered.end():]
+
+    # re.split with one capturing group alternates prose / protected parts;
+    # only the even-indexed (prose) parts are rewritten.
+    parts = _PROTECTED_SPAN.split(content)
+    parts[::2] = [_clean_prose(part) for part in parts[::2]]
+    return indent + prefix + "".join(parts)
+
+
+def clean_markdown_text(text: str) -> str:
+    """
+    Clean and normalize Markdown text without damaging valid Markdown.
+
+    Applied to every line outside fenced code blocks:
+    - collapse runs of spaces (indentation is preserved verbatim)
+    - add the missing space after list markers ("-item", "1.Item")
+    - add the missing space after sentence punctuation, commas, colons
+      and semicolons, and drop stray spaces before closing punctuation
+    - trim whitespace just inside emphasis markers ("** bold **")
+    - collapse consecutive blank lines into one
+
+    Left untouched: fenced code blocks (including their blank lines),
+    inline code, link/image targets, URLs, horizontal rules, decimal
+    numbers, and the spaces around emphasis spans.
+
+    Args:
+        text: Markdown text, with any line-ending convention.
+
+    Returns:
+        The cleaned text with "\\n" line endings and no leading or
+        trailing whitespace.
+    """
+    # Normalize line endings so the line-based pass sees one convention.
+    text = text.replace("\r\n", "\n").replace("\r", "\n")
+
+    cleaned_lines: List[str] = []
+    in_code_block = False
+
+    for line in text.split("\n"):
+        # Fence lines toggle code mode and are copied through unchanged.
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            cleaned_lines.append(line)
+            continue
+
+        if in_code_block:
+            cleaned_lines.append(line)
+            continue
+
+        line = _clean_markdown_line(line)
+        # Keep at most one blank line between paragraphs.
+        if line == "" and cleaned_lines and cleaned_lines[-1] == "":
+            continue
+        cleaned_lines.append(line)
+
+    return "\n".join(cleaned_lines).strip()
+
+
+def convert_docx_with_mammoth(docx_path: Path) -> str:
+    """Return the Markdown produced by mammoth for the DOCX at *docx_path*.
+
+    Any messages mammoth attaches to the result are printed. Every failure
+    is reported on stdout and then re-raised to the caller.
+    """
+    try:
+        with open(docx_path, "rb") as stream:
+            conversion = mammoth.convert_to_markdown(stream)
+        notes = conversion.messages
+        if notes:
+            print(f"Mammoth warnings: {notes}")
+        return conversion.value
+    except Exception as err:
+        # Report here, but let the caller decide how to handle the failure.
+        print(f"Mammoth conversion error: {err}")
+        raise
+
+
+def convert_docx_with_pandoc(docx_path: Path) -> str:
+    """Return the Markdown produced by pypandoc for the DOCX at *docx_path*."""
+    # pypandoc is handed the path as a plain string rather than a Path.
+    source_file = str(docx_path)
+    markdown_text = pypandoc.convert_file(source_file, "md", format="docx")
+    return markdown_text
+
+
+def _markdown_table_cell(text: str) -> str:
+    """Make cell text safe for one row of a Markdown pipe table."""
+    # A raw newline would end the table row and a raw "|" would start a new
+    # column, so join inner lines with <br> and escape pipes.
+    lines = [part.strip() for part in text.strip().splitlines()]
+    return "<br>".join(lines).replace("|", "\\|")
+
+
+def convert_docx_with_python_docx(docx_path: Path) -> str:
+    """Convert DOCX to Markdown using python-docx (basic fallback).
+
+    Paragraphs whose style name starts with "Heading" become ATX headings,
+    and every other non-empty paragraph becomes plain text. All tables are
+    appended after the paragraphs, so their original position in the
+    document is not preserved.
+
+    Args:
+        docx_path: Path to the DOCX file.
+
+    Returns:
+        The document as Markdown text.
+    """
+    doc = Document(docx_path)
+
+    content = []
+    for paragraph in doc.paragraphs:
+        text = paragraph.text.strip()
+        if not text:
+            continue
+
+        style_name = paragraph.style.name
+        if style_name.startswith("Heading"):
+            suffix = style_name.split()[-1]
+            level = int(suffix) if suffix.isdigit() else 1
+            # Markdown only defines heading levels 1-6; Word goes up to 9.
+            level = min(max(level, 1), 6)
+            content.append("#" * level + " " + text)
+        else:
+            content.append(text)
+        content.append("")  # Blank line after each paragraph
+
+    # Tables: the first row is treated as the header row.
+    for table in doc.tables:
+        content.append("")  # Blank line before table
+        for i, row in enumerate(table.rows):
+            row_data = [_markdown_table_cell(cell.text) for cell in row.cells]
+            content.append("| " + " | ".join(row_data) + " |")
+            if i == 0:  # Separator row after the header
+                content.append("|" + "---|" * len(row_data))
+        content.append("")  # Blank line after table
+
+    return "\n".join(content)
+
+
+def convert_single_docx(
+    input_path: Path, output_dir: Path, method: str = "mammoth"
+) -> bool:
+    """
+    Convert one DOCX file and write the result as ``<stem>.md``.
+
+    The requested method is used when its library is available. Otherwise
+    the first available one of mammoth, pandoc, python-docx is used.
+
+    Args:
+        input_path: Path to the DOCX file
+        output_dir: Directory to save the Markdown file
+        method: Conversion method ('mammoth', 'pandoc', or 'python-docx')
+
+    Returns:
+        True if successful, False otherwise
+    """
+    try:
+        print(f"Converting: {input_path.name}")
+
+        # Backends in fallback priority order, with their availability flag.
+        backends = (
+            ("mammoth", MAMMOTH_AVAILABLE, convert_docx_with_mammoth),
+            ("pandoc", PANDOC_AVAILABLE, convert_docx_with_pandoc),
+            ("python-docx", DOCX_AVAILABLE, convert_docx_with_python_docx),
+        )
+        usable = [(name, convert) for name, ready, convert in backends if ready]
+
+        converter = next(
+            (convert for name, convert in usable if name == method), None
+        )
+        if converter is None:
+            print(f"Method '{method}' not available. Trying alternatives...")
+            if not usable:
+                print("No conversion libraries available!")
+                return False
+            converter = usable[0][1]
+
+        # Convert and clean before touching the output file, so a failed
+        # conversion never leaves an empty file behind.
+        tidy_markdown = clean_markdown_text(converter(input_path))
+
+        destination = output_dir / f"{input_path.stem}.md"
+        destination.write_text(tidy_markdown, encoding="utf-8")
+
+        print(f"✓ Saved: {destination}")
+        return True
+
+    except Exception as err:
+        print(f"✗ Error converting {input_path.name}: {str(err)}")
+        return False
+
+
+def convert_directory(
+    input_dir: str, output_dir: str, method: str = "mammoth", recursive: bool = False
+):
+    """
+    Convert all DOCX files in a directory to Markdown.
+
+    Files are matched by a case-insensitive ``.docx`` suffix and processed
+    in sorted order. Word lock files (names starting with ``~$``) are
+    skipped. In recursive mode the input's subdirectory layout is mirrored
+    under the output directory, so same-named documents in different
+    folders do not overwrite each other.
+
+    Args:
+        input_dir: Directory containing DOCX files
+        output_dir: Directory to save Markdown files
+        method: Conversion method to use
+        recursive: Whether to search subdirectories
+    """
+    input_path = Path(input_dir)
+    output_path = Path(output_dir)
+
+    # Validate the input before creating anything on disk.
+    if not input_path.exists():
+        print(f"Error: Input directory '{input_dir}' does not exist.")
+        return
+    if not input_path.is_dir():
+        print(f"Error: Input path '{input_dir}' is not a directory.")
+        return
+
+    # Create output directory if it doesn't exist
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    # Match on the lowered suffix so ".DOCX" is found on case-sensitive
+    # filesystems too; sorting makes the processing order reproducible.
+    candidates = input_path.rglob("*") if recursive else input_path.glob("*")
+    docx_files = sorted(
+        f
+        for f in candidates
+        if f.suffix.lower() == ".docx"
+        and not f.name.startswith("~$")  # temporary Word lock files
+        and f.is_file()
+    )
+
+    if not docx_files:
+        print("No DOCX files found in the specified directory.")
+        return
+
+    print(f"Found {len(docx_files)} DOCX files to convert...")
+
+    successful = 0
+    failed = 0
+
+    for docx_file in docx_files:
+        # Mirror the file's folder relative to the input root. For files
+        # directly in the input directory this is the output root itself.
+        target_dir = output_path / docx_file.parent.relative_to(input_path)
+        try:
+            target_dir.mkdir(parents=True, exist_ok=True)
+        except OSError as e:
+            # One unwritable folder should not abort the whole batch.
+            print(f"✗ Error creating {target_dir}: {e}")
+            failed += 1
+            continue
+
+        if convert_single_docx(docx_file, target_dir, method):
+            successful += 1
+        else:
+            failed += 1
+
+    print("\nConversion complete:")
+    print(f"✓ Successful: {successful}")
+    print(f"✗ Failed: {failed}")
+
+
+def main():
+    """Parse the command line, print a run summary, and convert the directory."""
+    usage_examples = """
+Examples:
+  python docx_to_md.py /path/to/docx/files /path/to/output
+  python docx_to_md.py ./documents ./markdown --method pandoc --recursive
+  python docx_to_md.py ~/docs ~/markdown -r
+        """
+
+    # RawDescriptionHelpFormatter keeps the epilog's line breaks in --help.
+    cli = argparse.ArgumentParser(
+        description="Convert DOCX documents to Markdown format",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=usage_examples,
+    )
+    cli.add_argument("input_dir", help="Directory containing DOCX files")
+    cli.add_argument("output_dir", help="Directory to save Markdown files")
+    cli.add_argument(
+        "--method",
+        "-m",
+        choices=["mammoth", "pandoc", "python-docx"],
+        default="mammoth",
+        help="Conversion method (default: mammoth)",
+    )
+    cli.add_argument(
+        "--recursive",
+        "-r",
+        action="store_true",
+        help="Search subdirectories recursively",
+    )
+    options = cli.parse_args()
+
+    # Echo the effective settings before any work starts.
+    summary = (
+        "DOCX to Markdown Converter",
+        "=" * 30,
+        f"Input directory: {options.input_dir}",
+        f"Output directory: {options.output_dir}",
+        f"Method: {options.method}",
+        f"Recursive: {options.recursive}",
+    )
+    for summary_line in summary:
+        print(summary_line)
+    print()
+
+    convert_directory(
+        options.input_dir, options.output_dir, options.method, options.recursive
+    )
+
+
+if __name__ == "__main__":
+    main()