update

2025-05-09 05:30:08 +02:00
parent 7bb10e7df4
commit 73367bad9e
5322 changed files with 1266973 additions and 313 deletions
--- a/data_structure/analyze.py
+++ b/data_structure/analyze.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+import os
+import glob
+import re
+import zipfile
+import subprocess
+import xml.etree.ElementTree as ET
+import shutil
+import json
+from pathlib import Path
+import difflib
+import hashlib
+import docx
+from docx2pdf import convert
+
+
+def clean_string(input_string):
+    """Normalise a raw heading line into a bare title string.
+
+    The substitutions below are applied in order, each one operating on the
+    output of the previous one; surrounding whitespace is stripped last.
+    """
+    substitutions = (
+        # Drop everything up to and including the first run of digits, plus an
+        # optional period and any whitespace that follows it.
+        (r"^.*?(\d+)\.?\s*", ""),
+        # Put exactly one space after a period, comma or colon that is
+        # followed by a non-space, non-punctuation character.
+        (r"([.,:])\s*([^\s.,:])", r"\1 \2"),
+        # No whitespace in front of punctuation.
+        (r"\s+([.,:])", r"\1"),
+        # Collapse repeated punctuation marks into one.
+        (r"\.{2,}", "."),
+        (r":{2,}", ":"),
+        (r",{2,}", ","),
+        # Drop a period sitting at the very end of the string.
+        (r"\.$", ""),
+        # Collapse runs of whitespace into a single space.
+        (r"\s{2,}", " "),
+        # Drop a leading "тема:" label (optionally preceded by a period).
+        (r"^\.?\s*тема:\s*", ""),
+    )
+
+    cleaned = input_string
+    for pattern, replacement in substitutions:
+        cleaned = re.sub(pattern, replacement, cleaned)
+
+    return cleaned.strip()
+
+
+def _looks_like_title(para):
+    """Return True if any run in *para* is bold or set larger than 12 pt."""
+    for run in para.runs:
+        size = run.font.size
+        # python-docx reports font size as a Length in EMU (None when the size
+        # is inherited), so convert to points before comparing against 12.
+        if run.bold or (size is not None and size.pt > 12):
+            return True
+    return False
+
+
+def read_first_line_docx(filename):
+    """Return the cleaned title line of the .docx file at *filename*.
+
+    Selection order:
+      1. Among the first five paragraphs, the first non-empty one that is
+         either paragraph 0 or has title-like formatting (a bold run or a
+         run larger than 12 pt).
+      2. Otherwise the first non-empty paragraph anywhere in the document.
+
+    Only the first line of the chosen paragraph is used, and it is passed
+    through clean_string().
+
+    Returns:
+        The cleaned title, "Document contains no text" when no paragraph has
+        text, or "Error reading document: <message>" when anything raises.
+        Errors are reported through the return value, never raised.
+    """
+    try:
+        # Read the paragraph list once instead of on every access.
+        paragraphs = docx.Document(filename).paragraphs
+
+        if not paragraphs:
+            return "Document contains no text"
+
+        for index, para in enumerate(paragraphs[:5]):
+            text = para.text.strip()
+            if not text:
+                continue
+            if _looks_like_title(para) or index == 0:
+                return clean_string(text.split("\n")[0].strip())
+
+        # No title-like paragraph near the top: fall back to the first
+        # paragraph with any text. Strip before splitting so a paragraph that
+        # starts with a line break does not yield an empty title.
+        for para in paragraphs:
+            text = para.text.strip()
+            if text:
+                return clean_string(text.split("\n")[0].strip())
+
+        return "Document contains no text"
+
+    except Exception as e:
+        # Deliberately broad: callers expect a string result for every file.
+        return f"Error reading document: {str(e)}"
+
+
+def group_duplicates(items):
+    """Bucket *items* by their "title" value.
+
+    Returns a list of lists: one inner list per distinct title, in order of
+    first appearance, each holding the matching items in input order.
+    """
+    buckets = {}
+    for entry in items:
+        buckets.setdefault(entry["title"], []).append(entry)
+    return list(buckets.values())
+
+
+def main():
+    """Index the .docx files in ./data by title and write structure.json.
+
+    Each file's title is taken from read_first_line_docx(); files sharing a
+    title are grouped under one entry with a sequential id (T001, T002, ...).
+    The "files" list of each entry holds the file names with the .docx
+    extension swapped for .pdf.
+    """
+    data_dir = "./data"
+    docx_suffix = ".docx"
+
+    # os.listdir() returns entries in arbitrary order, so sort to keep the
+    # generated ids stable between runs. Entries that are not .docx files
+    # (other formats, subdirectories) are skipped rather than indexed under
+    # an "Error reading document" title.
+    filenames = sorted(
+        name
+        for name in os.listdir(data_dir)
+        if name.lower().endswith(docx_suffix)
+    )
+
+    text = [
+        {
+            "filename": filename,
+            "title": read_first_line_docx(os.path.join(data_dir, filename)),
+        }
+        for filename in filenames
+    ]
+
+    final_structure = []
+    for i, g in enumerate(group_duplicates(text), 1):
+        final_structure.append(
+            {
+                "id": f"T{i:03d}",
+                "title": g[0]["title"],
+                # Swap only the trailing extension; a plain str.replace would
+                # also rewrite "docx" occurring elsewhere in the name.
+                "files": [
+                    t["filename"][: -len(docx_suffix)] + ".pdf" for t in g
+                ],
+            }
+        )
+
+    with open("structure.json", "w", encoding="utf-8") as json_file:
+        json.dump(final_structure, json_file, ensure_ascii=False, indent=4)
+
+
+def rename_files():
+    """Renumber every ./data/*.docx file as f001.docx, f002.docx, ...
+
+    Files are numbered in sorted path order. The rename runs in two phases
+    (original -> unique temporary name -> final name) so that a file which
+    already carries one of the target names is never overwritten by an
+    earlier rename in the same run.
+    """
+    import uuid  # local import: only this function needs it
+
+    # Sort the files to ensure consistent numbering
+    docx_files = sorted(glob.glob("./data/*.docx"))
+
+    # A per-run token keeps the temporary names clear of any existing file.
+    token = uuid.uuid4().hex
+
+    # Phase 1: move every file out of the way.
+    staged = []
+    for i, file_path in enumerate(docx_files, 1):
+        temp_path = f"./data/renaming-{token}-{i:03d}.docx"
+        os.rename(file_path, temp_path)
+        # Final name uses 3 digits (001, 002, etc.)
+        staged.append((file_path, temp_path, f"./data/f{i:03d}.docx"))
+
+    # Phase 2: all target names are now free, so move into final position.
+    for file_path, temp_path, new_filename in staged:
+        os.rename(temp_path, new_filename)
+        print(f"Renamed: {file_path} -> {new_filename}")
+
+
+# Script entry point: running this file executes main() only;
+# rename_files() is defined above but is not invoked here.
+if __name__ == "__main__":
+    main()