update
This commit is contained in:
151
data_structure/convert.py
Normal file
151
data_structure/convert.py
Normal file
@@ -0,0 +1,151 @@
|
||||
import mammoth
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def format_text(text):
    """
    Format text with proper spacing around punctuation.

    Rules:
    - No space before the punctuation marks ;:,. or a closing parenthesis
    - One space after the punctuation marks ;:,.
    - Preserve commas AND periods inside decimal numbers (e.g. 3,14 and 3.14)

    Args:
        text: The string to reformat.

    Returns:
        The reformatted string.
    """
    # Temporarily replace separators that sit between two digits with
    # placeholders so the spacing rules below cannot split numbers.
    # Lookarounds (instead of consuming groups) also handle alternating
    # groups such as 1,234,567 correctly.
    text = re.sub(r"(?<=\d),(?=\d)", "##DECIMAL-COMMA##", text)
    # BUG FIX: the original protected only commas, so "3.14" was mangled
    # into "3. 14" by the period rule below.
    text = re.sub(r"(?<=\d)\.(?=\d)", "##DECIMAL-POINT##", text)

    # Insert one space after :, ;, , and . when glued to the next word.
    text = re.sub(r":(\w)", r": \1", text)
    text = re.sub(r";(\w)", r"; \1", text)
    text = re.sub(r",(\w)", r", \1", text)
    text = re.sub(r"\.(\w)", r". \1", text)

    # Remove whitespace that precedes punctuation or a closing parenthesis.
    text = re.sub(r"\s+([;:,.)])", r"\1", text)

    # Restore the protected decimal separators.
    text = text.replace("##DECIMAL-COMMA##", ",")
    text = text.replace("##DECIMAL-POINT##", ".")

    return text
|
||||
|
||||
|
||||
def convert_docx_to_html(
    docx_path, output_path, apply_formatting=True, remove_first_strong=True
):
    """
    Convert one DOCX file to an HTML file on disk.

    Args:
        docx_path: Path of the source .docx file.
        output_path: Path where the generated HTML is written (UTF-8).
        apply_formatting: When True, run format_text over every text node.
        remove_first_strong: When True, drop the first paragraph that
            consists solely of a <strong> element.

    Returns:
        The output_path that was written.
    """
    with open(docx_path, "rb") as source:
        html = mammoth.convert_to_html(source).value

    if remove_first_strong:
        # Remove only the first <p> that wraps nothing but a <strong> tag,
        # then sweep away any empty paragraphs left behind.
        html = re.sub(r"<p>\s*<strong>(.*?)</strong>\s*</p>", "", html, count=1)
        html = re.sub(r"<p>\s*</p>", "", html)

    if apply_formatting:
        # Parse the markup so formatting touches text nodes only,
        # skipping content where whitespace is significant.
        soup = BeautifulSoup(html, "html.parser")
        skip_parents = ["pre", "code", "script", "style"]
        for node in soup.find_all(text=True):
            if node.parent.name not in skip_parents:
                node.replace_with(format_text(node.string))
        html = str(soup)

    with open(output_path, "w", encoding="utf-8") as destination:
        destination.write(html)

    return output_path
|
||||
|
||||
|
||||
def convert_all_docx_files(
    input_dir, output_dir, apply_formatting=True, remove_first_strong=True
):
    """
    Convert every .docx file in input_dir to an .html file in output_dir.

    Args:
        input_dir: Directory scanned (non-recursively) for .docx files.
        output_dir: Directory for the generated files; created if missing.
        apply_formatting: Forwarded to convert_docx_to_html.
        remove_first_strong: Forwarded to convert_docx_to_html.
    """
    # Make sure the destination directory exists before writing into it.
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    docx_names = [name for name in os.listdir(input_dir) if name.endswith(".docx")]

    for name in docx_names:
        docx_path = os.path.join(input_dir, name)
        html_path = os.path.join(output_dir, os.path.splitext(name)[0] + ".html")

        print(f"Converting {docx_path} to {html_path}")
        convert_docx_to_html(docx_path, html_path, apply_formatting, remove_first_strong)

    print(f"Converted {len(docx_names)} files from {input_dir} to {output_dir}")
|
||||
|
||||
|
||||
def post_process_html_files(directory):
    """
    Fix punctuation spacing in already converted HTML files, in place.

    This is useful for repairing existing output without re-converting
    the original DOCX files.

    Args:
        directory: Directory whose .html files are rewritten.
    """
    html_names = [name for name in os.listdir(directory) if name.endswith(".html")]

    for name in html_names:
        path = os.path.join(directory, name)
        print(f"Post-processing {path}")

        with open(path, "r", encoding="utf-8") as handle:
            soup = BeautifulSoup(handle.read(), "html.parser")

        # Only touch text nodes, skipping content where whitespace matters.
        for node in soup.find_all(text=True):
            if node.parent.name not in ["pre", "code", "script", "style"]:
                node.replace_with(format_text(node.string))

        with open(path, "w", encoding="utf-8") as handle:
            handle.write(str(soup))

    print(f"Post-processed {len(html_names)} HTML files in {directory}")
|
||||
|
||||
|
||||
# Run the conversion when executed as a script.
if __name__ == "__main__":
    source_directory = "./data"
    target_directory = "./data_html"

    # Option 1: Convert DOCX to HTML
    convert_all_docx_files(
        source_directory,
        target_directory,
        apply_formatting=True,
        remove_first_strong=True,
    )

    # Option 2: Post-process existing HTML files
    # Uncomment the line below to use this option
    # post_process_html_files(target_directory)
|
||||
Reference in New Issue
Block a user