med-notes/analyze.py

142 lines
4.1 KiB
Python

#!/usr/bin/env python3
import os
import glob
import re
import zipfile
import subprocess
import xml.etree.ElementTree as ET
import shutil
import json
from pathlib import Path
import difflib
import hashlib
import docx
from docx2pdf import convert
def clean_string(input_string):
    """Normalize a raw heading line into a bare title string.

    The transformations are applied in this order:

    1. Drop everything up to and including the first run of digits, plus an
       optional period and any whitespace that follows it. Strings without
       digits are left untouched by this step.
    2. Put exactly one space after ``.``, ``,`` or ``:`` when it is followed
       by a non-punctuation character, and remove whitespace before those
       punctuation marks.
    3. Collapse repeated ``.``, ``:`` or ``,`` into a single character.
    4. Remove a trailing period, including any whitespace after it.
    5. Collapse whitespace runs and drop a leading "тема:" label.

    Args:
        input_string: The raw text of a heading line.

    Returns:
        The cleaned title, stripped of leading and trailing whitespace.
    """
    # Remove the leading numbering, e.g. "Lecture 5. " -> "".
    result = re.sub(r"^.*?(\d+)\.?\s*", "", input_string)

    # Normalize spacing around punctuation. Note that this also splits
    # decimals ("3.5" -> "3. 5") because any punctuation followed by a
    # non-space character gets a space inserted.
    result = re.sub(r"([.,:])\s*([^\s.,:])", r"\1 \2", result)
    result = re.sub(r"\s+([.,:])", r"\1", result)

    # Collapse runs of the same punctuation mark.
    result = re.sub(r"\.{2,}", ".", result)
    result = re.sub(r":{2,}", ":", result)
    result = re.sub(r",{2,}", ",", result)

    # Remove the ending dot. Trailing whitespace is tolerated here so that
    # "Title. " and "Title." both yield "Title"; anchoring on "\.$" alone
    # left the dot in place whenever whitespace followed it.
    result = re.sub(r"\.\s*$", "", result)

    # Collapse whitespace runs into a single space.
    result = re.sub(r"\s{2,}", " ", result)

    # Drop a leading "тема:" label.
    # NOTE(review): the match is case-sensitive, so "Тема:" is kept — confirm
    # whether that is intended.
    result = re.sub(r"^\.?\s*тема:\s*", "", result)

    return result.strip()
def read_first_line_docx(filename):
    """Return the cleaned title line of a .docx document.

    Among the first five paragraphs, the first non-empty paragraph is
    returned if it is the very first paragraph of the document or if any of
    its runs is bold or has an explicit font size above 12 pt. If none
    qualifies, the first non-empty paragraph of the whole document is used.
    The chosen text is cut at its first line break and passed through
    ``clean_string``.

    Args:
        filename: Path of the .docx file to read.

    Returns:
        The cleaned title, ``"Document contains no text"`` when no paragraph
        has text, or ``"Error reading document: <message>"`` when reading
        fails. Errors are reported through the return value, not raised.
    """
    try:
        paragraphs = docx.Document(filename).paragraphs
        if not paragraphs:
            return "Document contains no text"

        # Look for a title-like paragraph near the top of the document.
        for index, para in enumerate(paragraphs[:5]):
            text = para.text.strip()
            if not text:
                continue
            # Font sizes in python-docx are Length values stored in EMU, so
            # they must be converted to points before comparing with 12;
            # comparing the raw value made every explicit size look large.
            is_title = any(
                run.bold
                or (run.font.size is not None and run.font.size.pt > 12)
                for run in para.runs
            )
            if is_title or index == 0:
                return clean_string(text.split("\n")[0].strip())

        # Fall back to the first paragraph that has any text at all. The text
        # is stripped before splitting so a leading line break cannot yield
        # an empty title.
        for para in paragraphs:
            text = para.text.strip()
            if text:
                return clean_string(text.split("\n")[0].strip())

        return "Document contains no text"
    except Exception as e:
        # Deliberately broad: callers expect a string result for every file,
        # including unreadable ones.
        return f"Error reading document: {str(e)}"
def group_duplicates(items):
    """Bucket items that share the same ``"title"`` value.

    Args:
        items: Iterable of mappings, each with a ``"title"`` key.

    Returns:
        A list of groups (lists of the original items). Groups appear in the
        order their title was first seen, and items keep their input order
        within each group.
    """
    buckets = {}
    for entry in items:
        # setdefault creates the bucket on first sight of a title.
        buckets.setdefault(entry["title"], []).append(entry)
    return [*buckets.values()]
def main(data_dir="./data", output_path="structure.json"):
    """Group the .docx files of a directory by title and write a JSON index.

    Each .docx file in ``data_dir`` is read with ``read_first_line_docx``;
    files with identical titles are grouped with ``group_duplicates``. The
    result is written to ``output_path`` as a JSON list of objects with the
    keys ``"id"`` (``T001``, ``T002``, ...), ``"title"`` and ``"files"``
    (the group's file names with the extension changed to ``.pdf``).

    Args:
        data_dir: Directory holding the .docx files.
        output_path: Path of the JSON file to write.
    """
    entries = []
    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make the generated ids differ between runs.
    for filename in sorted(os.listdir(data_dir)):
        filepath = os.path.join(data_dir, filename)
        # Only regular .docx files are documents; skip directories and
        # unrelated files instead of recording them as read errors.
        if Path(filename).suffix.lower() != ".docx" or not os.path.isfile(filepath):
            continue
        entries.append(
            {"filename": filename, "title": read_first_line_docx(filepath)}
        )

    final_structure = [
        {
            "id": f"T{i:03d}",
            "title": group[0]["title"],
            # Swap only the extension; a plain str.replace would also
            # rewrite "docx" occurring elsewhere in the file name.
            "files": [
                Path(entry["filename"]).with_suffix(".pdf").name
                for entry in group
            ],
        }
        for i, group in enumerate(group_duplicates(entries), 1)
    ]

    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(final_structure, json_file, ensure_ascii=False, indent=4)
def rename_files():
    """Rename every ``./data/*.docx`` file to ``f001.docx``, ``f002.docx``, ...

    Files are numbered in sorted order of their current paths. The rename is
    done in two phases through ``<target>.tmp`` staging names, so a file that
    already carries one of the target names is never overwritten by another
    file being renamed onto it. Files already at their target name are left
    alone.

    Raises:
        FileExistsError: If a staging path already exists; nothing has been
            renamed at that point.
    """
    # Sort the files to ensure consistent numbering.
    docx_files = sorted(glob.glob("./data/*.docx"))

    # Plan the moves first: (current path, final path, staging path).
    plan = []
    for i, file_path in enumerate(docx_files, 1):
        new_filename = f"./data/f{i:03d}.docx"
        if os.path.normpath(file_path) == os.path.normpath(new_filename):
            continue  # already has the right name
        plan.append((file_path, new_filename, new_filename + ".tmp"))

    # Refuse to start if a staging name is taken, before touching any file.
    for _, _, staging in plan:
        if os.path.exists(staging):
            raise FileExistsError(f"Staging path already exists: {staging}")

    # Phase 1: move every source out of the way. After this, no planned
    # target name is occupied by a file that still has to be renamed.
    for file_path, _, staging in plan:
        os.rename(file_path, staging)

    # Phase 2: move the staged files to their final names.
    for file_path, new_filename, staging in plan:
        os.rename(staging, new_filename)
        print(f"Renamed: {file_path} -> {new_filename}")
# Script entry point: build the JSON index only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()