update
This commit is contained in:
151
data_structure/convert.py
Normal file
151
data_structure/convert.py
Normal file
@@ -0,0 +1,151 @@
|
||||
import mammoth
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def format_text(text):
    """
    Format text with proper spacing around punctuation.

    Rules:
    - No space before the punctuation marks ;:,. or a closing parenthesis
    - One space after the punctuation marks ;:,.
    - Preserve commas AND periods inside decimal numbers (e.g. 3,14 and 3.14)

    Args:
        text: The string to reformat.

    Returns:
        The reformatted string.
    """
    # Temporarily replace separators that sit between two digits with
    # placeholders so the spacing rules below cannot split numbers.
    # Lookarounds (instead of consuming groups) also handle alternating
    # groups such as 1,234,567 correctly.
    text = re.sub(r"(?<=\d),(?=\d)", "##DECIMAL-COMMA##", text)
    # BUG FIX: the original protected only commas, so "3.14" was mangled
    # into "3. 14" by the period rule below.
    text = re.sub(r"(?<=\d)\.(?=\d)", "##DECIMAL-POINT##", text)

    # Insert one space after :, ;, , and . when glued to the next word.
    text = re.sub(r":(\w)", r": \1", text)
    text = re.sub(r";(\w)", r"; \1", text)
    text = re.sub(r",(\w)", r", \1", text)
    text = re.sub(r"\.(\w)", r". \1", text)

    # Remove whitespace that precedes punctuation or a closing parenthesis.
    text = re.sub(r"\s+([;:,.)])", r"\1", text)

    # Restore the protected decimal separators.
    text = text.replace("##DECIMAL-COMMA##", ",")
    text = text.replace("##DECIMAL-POINT##", ".")

    return text
|
||||
|
||||
|
||||
def convert_docx_to_html(
    docx_path, output_path, apply_formatting=True, remove_first_strong=True
):
    """
    Convert one DOCX file to an HTML file on disk.

    Args:
        docx_path: Path of the source .docx file.
        output_path: Path where the generated HTML is written (UTF-8).
        apply_formatting: When True, run format_text over every text node.
        remove_first_strong: When True, drop the first paragraph that
            consists solely of a <strong> element.

    Returns:
        The output_path that was written.
    """
    with open(docx_path, "rb") as source:
        html = mammoth.convert_to_html(source).value

    if remove_first_strong:
        # Remove only the first <p> that wraps nothing but a <strong> tag,
        # then sweep away any empty paragraphs left behind.
        html = re.sub(r"<p>\s*<strong>(.*?)</strong>\s*</p>", "", html, count=1)
        html = re.sub(r"<p>\s*</p>", "", html)

    if apply_formatting:
        # Parse the markup so formatting touches text nodes only,
        # skipping content where whitespace is significant.
        soup = BeautifulSoup(html, "html.parser")
        skip_parents = ["pre", "code", "script", "style"]
        for node in soup.find_all(text=True):
            if node.parent.name not in skip_parents:
                node.replace_with(format_text(node.string))
        html = str(soup)

    with open(output_path, "w", encoding="utf-8") as destination:
        destination.write(html)

    return output_path
|
||||
|
||||
|
||||
def convert_all_docx_files(
    input_dir, output_dir, apply_formatting=True, remove_first_strong=True
):
    """
    Convert every .docx file in input_dir to an .html file in output_dir.

    Args:
        input_dir: Directory scanned (non-recursively) for .docx files.
        output_dir: Directory for the generated files; created if missing.
        apply_formatting: Forwarded to convert_docx_to_html.
        remove_first_strong: Forwarded to convert_docx_to_html.
    """
    # Make sure the destination directory exists before writing into it.
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    docx_names = [name for name in os.listdir(input_dir) if name.endswith(".docx")]

    for name in docx_names:
        docx_path = os.path.join(input_dir, name)
        html_path = os.path.join(output_dir, os.path.splitext(name)[0] + ".html")

        print(f"Converting {docx_path} to {html_path}")
        convert_docx_to_html(docx_path, html_path, apply_formatting, remove_first_strong)

    print(f"Converted {len(docx_names)} files from {input_dir} to {output_dir}")
|
||||
|
||||
|
||||
def post_process_html_files(directory):
    """
    Fix punctuation spacing in already converted HTML files, in place.

    This is useful for repairing existing output without re-converting
    the original DOCX files.

    Args:
        directory: Directory whose .html files are rewritten.
    """
    html_names = [name for name in os.listdir(directory) if name.endswith(".html")]

    for name in html_names:
        path = os.path.join(directory, name)
        print(f"Post-processing {path}")

        with open(path, "r", encoding="utf-8") as handle:
            soup = BeautifulSoup(handle.read(), "html.parser")

        # Only touch text nodes, skipping content where whitespace matters.
        for node in soup.find_all(text=True):
            if node.parent.name not in ["pre", "code", "script", "style"]:
                node.replace_with(format_text(node.string))

        with open(path, "w", encoding="utf-8") as handle:
            handle.write(str(soup))

    print(f"Post-processed {len(html_names)} HTML files in {directory}")
|
||||
|
||||
|
||||
# Run the conversion when executed as a script.
if __name__ == "__main__":
    source_directory = "./data"
    target_directory = "./data_html"

    # Option 1: Convert DOCX to HTML
    convert_all_docx_files(
        source_directory,
        target_directory,
        apply_formatting=True,
        remove_first_strong=True,
    )

    # Option 2: Post-process existing HTML files
    # Uncomment the line below to use this option
    # post_process_html_files(target_directory)
|
||||
Reference in New Issue
Block a user