# med-notes/data_structure/analyze.py

#!/usr/bin/env python3
import os
import glob
import re
import zipfile
import subprocess
import xml.etree.ElementTree as ET
import shutil
import json
from pathlib import Path
import difflib
import hashlib
import docx
from docx2pdf import convert


def clean_string(input_string):
    """Normalize a raw heading line into a bare topic title.

    The transformations are applied in this order:

    1. Trim surrounding whitespace, so the end-of-string anchors used below
       see the real last character.
    2. Drop everything up to and including the first run of digits, plus an
       optional period and any whitespace after it (a numbering prefix).
       Strings without digits are left untouched by this step.
    3. Put exactly one space after ``.``, ``,`` or ``:`` when another
       non-punctuation character follows, and remove whitespace before
       those marks.
    4. Collapse repeated ``.``, ``:`` or ``,`` into a single mark.
    5. Remove one trailing period.
    6. Collapse runs of whitespace into a single space.
    7. Remove a leading lowercase "тема:" label (optionally preceded by a
       period).

    Args:
        input_string: The raw line of text to clean.

    Returns:
        The cleaned title string (possibly empty).
    """
    # Trim first: otherwise trailing whitespace hides a final period from the
    # ``\.$`` pattern below and the dot survives the cleanup.
    result = input_string.strip()

    # Remove everything up to and including the first number, any period that
    # follows it, and then any whitespace.
    result = re.sub(r"^.*?(\d+)\.?\s*", "", result)

    # Add a single space after punctuation that is followed by other text.
    result = re.sub(r"([.,:])\s*([^\s.,:])", r"\1 \2", result)

    # Remove space before punctuation.
    result = re.sub(r"\s+([.,:])", r"\1", result)

    # Replace multiple consecutive identical punctuation marks with one.
    result = re.sub(r"\.{2,}", ".", result)
    result = re.sub(r":{2,}", ":", result)
    result = re.sub(r",{2,}", ",", result)

    # Remove the ending dot.
    result = re.sub(r"\.$", "", result)

    # Replace multiple whitespace characters with a single space.
    result = re.sub(r"\s{2,}", " ", result)

    # Strip a leading "тема:" label.
    result = re.sub(r"^\.?\s*тема:\s*", "", result)

    # Trim whatever whitespace the substitutions left at either end.
    return result.strip()


def read_first_line_docx(filename):
    """Return the cleaned title line of a .docx document.

    The first five paragraphs are scanned for a non-empty paragraph that
    either is the very first paragraph or looks like a title (some run is
    bold or has an explicit font size above 12 points). If none qualifies,
    the first non-empty paragraph of the whole document is used instead.

    Args:
        filename: Path of the .docx file to open.

    Returns:
        The first line of the chosen paragraph passed through
        ``clean_string``; the string "Document contains no text" when the
        document has no paragraph with text; or
        "Error reading document: <message>" when any exception is raised
        while reading (errors are reported as a value, not raised).
    """
    try:
        doc = docx.Document(filename)

        # Check if document has any content
        if not doc.paragraphs:
            return "Document contains no text"

        # Look for a title-like paragraph among the first 5 paragraphs
        for i, para in enumerate(doc.paragraphs[:5]):
            text = para.text.strip()
            if not text:
                continue

            # ``font.size`` is a python-docx Length measured in EMU, so it
            # must be converted to points before comparing against 12;
            # comparing the raw value would accept every explicit size.
            is_title = any(
                run.bold
                or (run.font.size is not None and run.font.size.pt > 12)
                for run in para.runs
            )

            # Accept a title-like paragraph, or the first paragraph if it has text
            if is_title or i == 0:
                return clean_string(text.split("\n")[0].strip())

        # Fallback: the first paragraph anywhere that has any text. Strip
        # before splitting so a leading line break cannot yield an empty title.
        for para in doc.paragraphs:
            text = para.text.strip()
            if text:
                return clean_string(text.split("\n")[0].strip())

        return "Document contains no text"

    except Exception as e:
        # Deliberate best-effort: the caller records the message as the title.
        return f"Error reading document: {str(e)}"


def group_duplicates(items):
    """Bucket items that share the same ``"title"`` value.

    Args:
        items: Iterable of mappings, each providing a ``"title"`` key.

    Returns:
        A list of groups (lists of items). Groups appear in the order their
        title was first seen, and items keep their input order within a
        group.
    """
    buckets = {}
    for entry in items:
        # setdefault creates the bucket on the first sighting of a title.
        buckets.setdefault(entry["title"], []).append(entry)
    return [bucket for bucket in buckets.values()]


def main():
    """Build ``structure.json`` from the .docx files in ``./data``.

    Each .docx file's title is read with ``read_first_line_docx``; files
    sharing a title are grouped with ``group_duplicates``; every group gets
    an id ``T001``, ``T002``, ... and the list of its files with the
    ``.docx`` extension swapped for ``.pdf``. The result is written to
    ``structure.json`` in the current working directory as UTF-8 JSON.
    """
    data_dir = "./data"

    # os.listdir returns entries in arbitrary order; sort so the generated
    # ids are reproducible between runs. Only .docx files are considered:
    # anything else cannot be parsed and would end up as an
    # "Error reading document" title.
    filenames = sorted(
        name for name in os.listdir(data_dir) if name.endswith(".docx")
    )

    entries = [
        {
            "filename": name,
            "title": read_first_line_docx(os.path.join(data_dir, name)),
        }
        for name in filenames
    ]

    final_structure = [
        {
            "id": f"T{i:03d}",
            "title": group[0]["title"],
            # Swap only the extension; a plain str.replace would also rewrite
            # "docx" occurring elsewhere in the file name.
            "files": [
                os.path.splitext(entry["filename"])[0] + ".pdf"
                for entry in group
            ],
        }
        for i, group in enumerate(group_duplicates(entries), 1)
    ]

    with open("structure.json", "w", encoding="utf-8") as json_file:
        json.dump(final_structure, json_file, ensure_ascii=False, indent=4)


def rename_files():
    """Rename every ``./data/*.docx`` file to ``f001.docx``, ``f002.docx``, ...

    Files are numbered in sorted path order. The rename is done in two
    phases through temporary names, so a file that already carries one of
    the target names (for example from an earlier run) is never overwritten
    by another file being renamed onto it.

    Prints one "Renamed: <old> -> <new>" line per file.
    """
    # Sort the files to ensure consistent numbering
    docx_files = sorted(glob.glob("./data/*.docx"))

    # Phase 1: move every file out of the way under a unique temporary name.
    # Renaming directly could replace an existing fNNN.docx that has not been
    # processed yet, silently losing that file.
    staged = []
    for i, file_path in enumerate(docx_files, 1):
        temp_path = f"{file_path}.renaming-{os.getpid()}-{i:03d}"
        os.rename(file_path, temp_path)
        staged.append((file_path, temp_path))

    # Phase 2: all target names are free now; assign the final 3-digit names.
    for i, (file_path, temp_path) in enumerate(staged, 1):
        new_filename = f"./data/f{i:03d}.docx"
        os.rename(temp_path, new_filename)
        print(f"Renamed: {file_path} -> {new_filename}")


# Script entry point: generate structure.json only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()