commit 9a86fe25e1373da1f78bc7d540afbcc325cdffdc Author: Tomas Mirchev Date: Thu Apr 3 04:16:38 2025 +0000 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6f9f1f1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +data/ +data_bak/ diff --git a/Archive.zip b/Archive.zip new file mode 100644 index 0000000..148ea51 Binary files /dev/null and b/Archive.zip differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..800862c --- /dev/null +++ b/README.md @@ -0,0 +1,20 @@ +# Notes + +filename: `file-{counter}.docx` -> `file-{counter}.pdf` +- group by title + +structure: +``` +{ + topics: [ + { + id: "t111", + title: "some title", + files: [ + "file-11.pdf", + "file-42.pdf" + ] + } + ] +} +``` diff --git a/analyze.py b/analyze.py new file mode 100644 index 0000000..8b5afe2 --- /dev/null +++ b/analyze.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +import os +import glob +import re +import zipfile +import subprocess +import xml.etree.ElementTree as ET +import shutil +import json +from pathlib import Path +import difflib +import hashlib +import docx +from docx2pdf import convert +from prettytable import PrettyTable + + +def clean_string(input_string): + + # Remove everything up to and including the first numbers and any period that follows, + # then any whitespace + result = re.sub(r"^.*?(\d+)\.?\s*", "", input_string) + + # Fix spacing around punctuation (periods, colons, commas) + # Add space after punctuation if followed by non-whitespace + result = re.sub(r"([.,:])\s*([^\s.,:])", r"\1 \2", result) + + # Remove space before punctuation + result = re.sub(r"\s+([.,:])", r"\1", result) + + # Replace multiple consecutive punctuation marks with a single one + result = re.sub(r"\.{2,}", ".", result) + result = re.sub(r":{2,}", ":", result) + result = re.sub(r",{2,}", ",", result) + + # Remove ending dot + result = re.sub(r"\.$", "", result) + + # Replace multiple whitespaces with a single space + result = re.sub(r"\s{2,}", " ", result) + + result = re.sub(r"^\.?\s*тема:\s*", "", result) + + # Trim leading and trailing whitespace + result = result.strip() + + return result + + +def read_first_line_docx(filename): + try: + doc = docx.Document(filename) + + # Check if document has any content + if not doc.paragraphs: + return "Document contains no text" + + # Look for the first non-empty paragraph + for i, para in enumerate(doc.paragraphs[:5]): # Check first 5 paragraphs + text = para.text.strip() + if text: + # Check if this paragraph has title-like formatting + is_title = False + for run in para.runs: + if run.bold or run.font.size and run.font.size > 12: + is_title = True + break + + # Return this paragraph if it's formatted like a title or if it's the first with text + if is_title or i == 0: + # Split by newlines and take first line + first_line = text.split("\n")[0].strip() + return clean_string(first_line) + + # If we reach here, use the first paragraph that has any text + for para in doc.paragraphs: + if para.text.strip(): + return clean_string(para.text.split("\n")[0].strip()) + + return "Document contains no text" + + except Exception as e: + return f"Error reading document: {str(e)}" + + +def group_duplicates(items): + grouped = {} + + # Group by title + for item in items: + title = item["title"] + if title not in grouped: + grouped[title] = [] + grouped[title].append(item) + + # Convert to list of groups + result = list(grouped.values()) + return result + + +def main(): + convert("./data/", "./data_pdf/") + return + table = PrettyTable() + table.field_names = ["id", "filename", "first line"] + + text = [] + + data_dir = "./data_pdf" + filenames = os.listdir(data_dir) + for i, filename in enumerate(filenames): + filepath = os.path.join(data_dir, filename) + first_line = read_first_line_docx(filepath) + text.append({"filename": filename, "title": first_line}) + + final_structure = [] + grouped = group_duplicates(text) + for i, g in enumerate(grouped, 1): + final_structure.append( + { + "id": f"F{i:03d}", + "title": g[0]["title"], + "files": [t["filename"] for t in g], + } + ) + with open("structure.json", "w", encoding="utf-8") as json_file: + json.dump(final_structure, json_file, ensure_ascii=False, indent=4) + + +def rename_files(): + docx_files = glob.glob("./data/*.docx") + + # Sort the files to ensure consistent numbering + docx_files.sort() + + # Rename each file with a counter + for i, file_path in enumerate(docx_files, 1): + # Create the new filename with 3 digits (001, 002, etc.) + new_filename = f"./data/f{i:03d}.docx" + + # Rename the file + os.rename(file_path, new_filename) + print(f"Renamed: {file_path} -> {new_filename}") + + +if __name__ == "__main__": + main() diff --git a/code.py b/code.py new file mode 100644 index 0000000..6111f1b --- /dev/null +++ b/code.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +import os +import re +import zipfile +import subprocess +import xml.etree.ElementTree as ET +import shutil +import json +from pathlib import Path +import difflib +import hashlib + + +def extract_first_line_docx(docx_file): + """Extract the first line of text from a docx file using built-in libraries.""" + try: + # Open the docx file as a zip archive + with zipfile.ZipFile(docx_file) as zip_ref: + # Extract document.xml which contains the main content + xml_content = zip_ref.read("word/document.xml") + + # Parse the XML + root = ET.fromstring(xml_content) + + # Define namespace + namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + # Find all text elements + text_elements = root.findall(".//w:t", namespaces) + + # Concatenate text from first paragraph with content + paragraph_text = "" + for element in text_elements: + text = element.text + if text and text.strip(): + paragraph_text += text + # If we have some text and hit a line break or period, that's good enough for a first line + if "\n" in paragraph_text or "." in paragraph_text: + break + + # Clean and return the first line if found + if paragraph_text.strip(): + return paragraph_text.strip() + + return "NoTextFound" + + except Exception as e: + print(f"Error reading {docx_file}: {e}") + return "ErrorReadingFile" + + +def extract_first_line_doc(doc_file): + """Extract the first line of text from a doc file using antiword if available, otherwise return a placeholder.""" + try: + # Check if antiword is installed + result = subprocess.run( + ["which", "antiword"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + if result.returncode == 0: + # Use antiword to extract text + result = subprocess.run( + ["antiword", str(doc_file)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if result.returncode == 0: + text = result.stdout + # Get the first non-empty line + for line in text.split("\n"): + if line.strip(): + return line.strip() + else: + print(f"Antiword error: {result.stderr}") + else: + # Try catdoc if available + result = subprocess.run( + ["which", "catdoc"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + if result.returncode == 0: + result = subprocess.run( + ["catdoc", str(doc_file)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if result.returncode == 0: + text = result.stdout + # Get the first non-empty line + for line in text.split("\n"): + if line.strip(): + return line.strip() + else: + print(f"Catdoc error: {result.stderr}") + else: + print( + "Neither antiword nor catdoc is installed. Cannot extract text from .doc files." + ) + return f"Doc_FileNeedsConversion_{Path(doc_file).stem}" + + return "NoTextFound" + + except Exception as e: + print(f"Error reading {doc_file}: {e}") + return "ErrorReadingFile" + + +def clean_title(title): + """Clean title by removing various prefixes and formatting.""" + # Remove leading numbers and dots (like "15 ", "3.", "8", etc.) + title = re.sub(r"^\d+\.?\s*", "", title) + + # More aggressive pattern to match different variations of "Тема: N" or "Тема N" + title = re.sub(r"^[Тт]ема[\s:]*\d+\s*", "", title) + + # Remove "copy" at the end + title = re.sub(r"\s+copy$", "", title) + + # Remove extra spaces + title = re.sub(r"\s+", " ", title).strip() + + return title + + +def calculate_file_hash(filepath): + """Calculate MD5 hash of a file to compare content.""" + hasher = hashlib.md5() + with open(filepath, "rb") as f: + buf = f.read(65536) # Read in 64k chunks + while len(buf) > 0: + hasher.update(buf) + buf = f.read(65536) + return hasher.hexdigest() + + +def find_similar_files(files): + """Group similar files to detect originals and copies.""" + # First pass: group by filename similarity + filename_groups = {} + + for file_path in files: + filename = file_path.name + # Remove "copy" for comparison + base_name = re.sub(r"\s+copy\.[^.]+$", "", filename) + base_name = re.sub(r"\s+copy$", "", base_name) + + if base_name not in filename_groups: + filename_groups[base_name] = [] + filename_groups[base_name].append(file_path) + + # Filter groups to keep only those with multiple files + duplicate_groups = {k: v for k, v in filename_groups.items() if len(v) > 1} + + # Second pass: verify content similarity using hashes + verified_duplicates = {} + + for base_name, file_paths in duplicate_groups.items(): + hashes = {} + for file_path in file_paths: + file_hash = calculate_file_hash(file_path) + if file_hash not in hashes: + hashes[file_hash] = [] + hashes[file_hash].append(file_path) + + # Keep groups with identical content + identical_files = [ + files for hash_val, files in hashes.items() if len(files) > 1 + ] + if identical_files: + verified_duplicates[base_name] = identical_files + + # Also track files with the same name but different content + if len(hashes) > 1: + print( + f"Warning: Files with similar names but different content: {file_paths}" + ) + + return verified_duplicates, duplicate_groups + + +def main(): + # Directory containing the doc/docx files + data_dir = "./data" + + # Create backup and duplicates directories + backup_dir = os.path.join(data_dir, "docs_backup") + duplicates_dir = os.path.join(data_dir, "duplicates") + os.makedirs(backup_dir, exist_ok=True) + os.makedirs(duplicates_dir, exist_ok=True) + + # Get all doc and docx files in the data directory + docs_path = Path(data_dir) + doc_files = list(docs_path.glob("*.doc")) + docx_files = list(docs_path.glob("*.docx")) + all_files = doc_files + docx_files + + if not all_files: + print(f"No doc/docx files found in {data_dir}") + return + + print(f"Found {len(all_files)} documents") + + # First, let's handle duplicates + print("\nChecking for duplicate files...") + verified_duplicates, potential_duplicates = find_similar_files(all_files) + + # Move duplicate files to the duplicates directory + files_to_remove = [] + for base_name, duplicate_groups in verified_duplicates.items(): + for duplicate_group in duplicate_groups: + # Keep the first file, move others to duplicates directory + keeper = duplicate_group[0] + for dupe in duplicate_group[1:]: + print( + f"Moving duplicate: {dupe.name} → duplicates/ (identical to {keeper.name})" + ) + shutil.move(dupe, os.path.join(duplicates_dir, dupe.name)) + files_to_remove.append(dupe) + + # For potential duplicates with "copy" in the name + for base_name, file_paths in potential_duplicates.items(): + # Check if any have "copy" in the name + copy_files = [f for f in file_paths if "copy" in f.name.lower()] + non_copy_files = [f for f in file_paths if "copy" not in f.name.lower()] + + # If we have both originals and copies + if copy_files and non_copy_files: + for copy_file in copy_files: + # Check if we already moved this as a verified duplicate + if copy_file in files_to_remove: + continue + + print( + f"Moving potential duplicate: {copy_file.name} → duplicates/ (has 'copy' in name)" + ) + shutil.move(copy_file, os.path.join(duplicates_dir, copy_file.name)) + files_to_remove.append(copy_file) + + # Remove duplicates from the processing list + for file_path in files_to_remove: + if file_path in all_files: + all_files.remove(file_path) + + # Now let's process the remaining files + print(f"\nProcessing {len(all_files)} unique files...") + + # Sort files by name for consistent processing + all_files.sort(key=lambda x: x.name) + + # Dictionary to store full titles + title_map = {} + + # Process each file + for counter, filepath in enumerate(all_files, 1): + filename = filepath.name + + # Skip files in the backup/duplicates directories + if any(d in str(filepath) for d in ["docs_backup", "duplicates"]): + continue + + print(f"Processing {filename}...") + + # Get first line of content based on file type + if filepath.suffix.lower() == ".docx": + first_line = extract_first_line_docx(filepath) + else: # .doc + first_line = extract_first_line_doc(filepath) + + # Clean the title + clean_full_title = clean_title(first_line) + + # Format counter with leading zeros + counter_str = f"{counter:03d}" + + # Store the full title in the dictionary with counter as key + title_map[counter_str] = clean_full_title + + # Create new filename with requested format: "TXXX.extension" + new_filename = f"T{counter_str}{filepath.suffix}" + new_filepath = filepath.parent / new_filename + + # Create a backup + backup_path = Path(backup_dir) / filename + shutil.copy2(filepath, backup_path) + + print(f"Renaming: {filename} → {new_filename}") + + # Rename the file + filepath.rename(new_filepath) + + # Save the full titles to a JSON file + json_path = os.path.join(data_dir, "document_titles.json") + with open(json_path, "w", encoding="utf-8") as json_file: + json.dump(title_map, json_file, ensure_ascii=False, indent=2) + + print(f"\nAll files renamed. Backups stored in '{backup_dir}' directory.") + print(f"Duplicate files moved to '{duplicates_dir}' directory.") + print(f"Full titles saved to '{json_path}'") + + # Print installation instructions if needed + if doc_files: + print( + "\nNOTE: To process .doc files, you may need to install one of these tools:" + ) + print(" - antiword: sudo apt-get install antiword") + print(" - catdoc: sudo apt-get install catdoc") + + +if __name__ == "__main__": + main() diff --git a/structure.json b/structure.json new file mode 100644 index 0000000..0e14206 --- /dev/null +++ b/structure.json @@ -0,0 +1,466 @@ +[ + { + "id": "F001", + "title": "Предмет и задачи на вътрешната медицина. Раздели на вътрешните болести", + "files": [ + "f001.docx" + ] + }, + { + "id": "F002", + "title": "Тумори на белия дроб", + "files": [ + "f002.docx" + ] + }, + { + "id": "F003", + "title": "Белодробен тромбоемболизъм", + "files": [ + "f003.docx", + "f096.docx" + ] + }, + { + "id": "F004", + "title": "Плеврити", + "files": [ + "f004.docx", + "f097.docx" + ] + }, + { + "id": "F005", + "title": "Белодробна туберколоза-етиология, патогенеза, и клинична картина. Първична и вторична ТБК. Лечение и профилактика на туберкулоза", + "files": [ + "f005.docx", + "f098.docx" + ] + }, + { + "id": "F006", + "title": "Дихателна недостатъчност", + "files": [ + "f006.docx" + ] + }, + { + "id": "F007", + "title": "Основни симптоми и синдроми при заболявания на ССС Физикални и специални методи на изследване на ССС", + "files": [ + "f007.docx", + "f031.docx" + ] + }, + { + "id": "F008", + "title": "Болест и здраве. Етилогия и патогенеза на болестите. Периоди на болестта", + "files": [ + "f008.docx" + ] + }, + { + "id": "F009", + "title": "Болести на надбъбречните жлези: Хиперкортицизъм. Хипокортицизъм", + "files": [ + "f009.docx", + "f047.docx" + ] + }, + { + "id": "F010", + "title": "Агония. Клинична смърт. Биологична смърт", + "files": [ + "f010.docx" + ] + }, + { + "id": "F011", + "title": "Физикални методи на изследване на стомашно-чревния тракт", + "files": [ + "f011.docx" + ] + }, + { + "id": "F012", + "title": "Основни симптоми и синдроми при заболявания на отделителната система. Функционално изследване на ОС", + "files": [ + "f012.docx" + ] + }, + { + "id": "F013", + "title": "Основни класически методи на изследване във вътрешната медицина-анамнеза. Физикални методи на изследване на болните –оглед, палпация, перкусия, аускултация. Специални методи на изследване на пациентите", + "files": [ + "f013.docx" + ] + }, + { + "id": "F014", + "title": "Деформираща артроза ОСТЕОАРТРОЗА", + "files": [ + "f014.docx" + ] + }, + { + "id": "F015", + "title": "Сегашно състояние-обективен статус на болния. Клинична диагноза и прогноза. Проследяване на болния-декурзус", + "files": [ + "f015.docx" + ] + }, + { + "id": "F016", + "title": "Изследване на ДС. Основни симптоми и синдроми на заболявания на ДС. Физиклани и спциални методи на изследване", + "files": [ + "f016.docx" + ] + }, + { + "id": "F017", + "title": "Остър и хроничен бронхит. Белодробен емфизем. ХОББ", + "files": [ + "f017.docx", + "f057.docx" + ] + }, + { + "id": "F018", + "title": "Пневмонии: класификация, клиника, лечение", + "files": [ + "f018.docx", + "f075.docx" + ] + }, + { + "id": "F019", + "title": "Бронхиектазии. Белодробен абцес", + "files": [ + "f019.docx" + ] + }, + { + "id": "F020", + "title": "Хипертонична болест- рискови фактори;патогенеза;клиника, лечение", + "files": [ + "f020.docx", + "f035.docx" + ] + }, + { + "id": "F021", + "title": "Изследване на стомашно-чревния тракт. Анемнеза. Основни симптоми и синдроми при заболявания на стомашно-чревния тракт", + "files": [ + "f021.docx", + "f054.docx" + ] + }, + { + "id": "F022", + "title": "Пиелонефрити", + "files": [ + "f022.docx", + "f074.docx" + ] + }, + { + "id": "F023", + "title": "Хемолитични анемии вследствие на вътре и извънеритроцитни фактори, вродени и придобити", + "files": [ + "f023.docx", + "f084.docx" + ] + }, + { + "id": "F024", + "title": "Хеморагични диатези – хемофилия, есенциална тромбоцитопения, капиляротоксикоза", + "files": [ + "f024.docx", + "f091.docx" + ] + }, + { + "id": "F025", + "title": "Бластна левкоза. Хронична миелолевкоза", + "files": [ + "f025.docx", + "f087.docx" + ] + }, + { + "id": "F026", + "title": "Нехочкинови и хочкинови лимфоми", + "files": [ + "f026.docx", + "f089.docx" + ] + }, + { + "id": "F027", + "title": "Остри екзогенни интоксикации. Общи принципи и правила в лечението на острите екзогенни отравяния. Поведение на медицинската сестра и грижи за болния с остро отравяне", + "files": [ + "f027.docx", + "f093.docx" + ] + }, + { + "id": "F028", + "title": "Алергия. Алергични заболявания. Анафилактичен шок. Поведение на медицинската сестра при спешни алергични състояния", + "files": [ + "f028.docx", + "f094.docx" + ] + }, + { + "id": "F029", + "title": "Основни класически методи на изследване във вътрешната медицина-анамнеза. Физикални методи за изледване на болните-оглед, палпация, перкусия, аускултация. Сегашно състояние-обективен статус на болния. Клинична диагноза и прогноза. Проследяване на болния-Декурзус", + "files": [ + "f029.docx" + ] + }, + { + "id": "F030", + "title": "Дихателна недостатъчност- остра и хронична. Етиология, степени, клиника", + "files": [ + "f030.docx" + ] + }, + { + "id": "F031", + "title": "Ревматизъм", + "files": [ + "f032.docx", + "f034.docx" + ] + }, + { + "id": "F032", + "title": "Ендокардити, перикардити", + "files": [ + "f033.docx", + "f039.docx" + ] + }, + { + "id": "F033", + "title": "Остра периферна сърдечно-съдова недостатъчност. Кардиологичен шок", + "files": [ + "f036.docx" + ] + }, + { + "id": "F034", + "title": "Лечение на СН и поведение на м. с", + "files": [ + "f037.docx" + ] + }, + { + "id": "F035", + "title": "Ритъмни нарушения на сърдечната дейност. Проводни нарушения на сърдечната дейност", + "files": [ + "f038.docx", + "f044.docx" + ] + }, + { + "id": "F036", + "title": "ИБС: етиология, рискови фактори, патофизиология. Стенокардия", + "files": [ + "f040.docx", + "f045.docx" + ] + }, + { + "id": "F037", + "title": "Изследване на ДС. Основни симптоми и синдроми при заболявания на ДС. Физикални и специаални методи на изследване на ДС", + "files": [ + "f041.docx" + ] + }, + { + "id": "F038", + "title": "ИБС: етиология, рискови фактори, патофизиология. Инфаркт на миокарда", + "files": [ + "f042.docx", + "f046.docx" + ] + }, + { + "id": "F039", + "title": "Болести на хипофизата- Акромегалия ;Безвкусен диабет", + "files": [ + "f043.docx", + "f048.docx" + ] + }, + { + "id": "F040", + "title": "Захарен диабет-етиология, патогенеза, класификация, клиника. Диабетна кетоацидоза и хипокликемична кома. Поведение на МС при диабетно болен в кома", + "files": [ + "f049.docx" + ] + }, + { + "id": "F041", + "title": "Болести на щитовидната жлеза: Тиреотоксикоза. Микседем. Ендемична гуша", + "files": [ + "f050.docx" + ] + }, + { + "id": "F042", + "title": "Захарен диабет – късни усложнения. Захарен диабет- диета и медикаментозно лечение", + "files": [ + "f051.docx", + "f056.docx" + ] + }, + { + "id": "F043", + "title": "Затлъстяване. Подагра", + "files": [ + "f052.docx", + "f053.docx" + ] + }, + { + "id": "F044", + "title": "Физикални и специални методи на изследване на стомашно-чревния тракт", + "files": [ + "f055.docx" + ] + }, + { + "id": "F045", + "title": "Гастрити. ГЕРБ", + "files": [ + "f058.docx", + "f061.docx" + ] + }, + { + "id": "F046", + "title": "Язвена болест. Рак на стомаха", + "files": [ + "f059.docx", + "f063.docx" + ] + }, + { + "id": "F047", + "title": "Ентерити и колити. Рак на дебелото черво", + "files": [ + "f060.docx", + "f065.docx" + ] + }, + { + "id": "F048", + "title": "Основни симптоми и синдроми при заболяване на черния дроб и жлъчните пътища. Жълтеница, портална хипертония, асцит. Анамнеза, физикални и специални методи за изследване на черния дроб и жлъчните пътища", + "files": [ + "f062.docx" + ] + }, + { + "id": "F049", + "title": "Хронични хепатити. Чернодробна цироза", + "files": [ + "f064.docx", + "f071.docx" + ] + }, + { + "id": "F050", + "title": "Холелитиаза. Холецистити", + "files": [ + "f066.docx", + "f073.docx" + ] + }, + { + "id": "F051", + "title": "Основни симптоми и синдроми при заболяване на черния дроб и жлъчните пътища. Анамнеза, физикални и специални методи за изледване на черния дроб и жлъчните пътища", + "files": [ + "f067.docx", + "f068.docx" + ] + }, + { + "id": "F052", + "title": "Остър и хроничен гломерулонефрит", + "files": [ + "f069.docx", + "f070.docx" + ] + }, + { + "id": "F053", + "title": "Нефролитиаза", + "files": [ + "f072.docx", + "f077.docx" + ] + }, + { + "id": "F054", + "title": "Остра бъбречна и хронична бъбречна недостатъчност", + "files": [ + "f076.docx" + ] + }, + { + "id": "F055", + "title": "Балканска ендемична нефропатия. Бъбречна поликистозна болест. Бъбречна туберкулоза", + "files": [ + "f078.docx", + "f082.docx" + ] + }, + { + "id": "F056", + "title": "Ревмтоиден артрит", + "files": [ + "f079.docx", + "f085.docx" + ] + }, + { + "id": "F057", + "title": "Деформираща артроза", + "files": [ + "f080.docx", + "f081.docx" + ] + }, + { + "id": "F058", + "title": "Желязодефицитни анемии", + "files": [ + "f083.docx", + "f088.docx" + ] + }, + { + "id": "F059", + "title": "Витамин В 12 – дефицитни анемии", + "files": [ + "f086.docx", + "f090.docx" + ] + }, + { + "id": "F060", + "title": "Бронхоектазии. Белодробен абцес", + "files": [ + "f092.docx" + ] + }, + { + "id": "F061", + "title": "Tумори на белия дроб", + "files": [ + "f095.docx" + ] + } +] \ No newline at end of file