Commit ec8beb89 authored by Marco Schmiedel's avatar Marco Schmiedel

fix

parent fc69800a
{
"fileId": "1d59cc86-7b89-484d-a6da-2e1563612c68",
"originalPath": "work/routes/EeccxRouter.py",
"currentPath": "work/routes/EeccxRouter.py",
"hash": "92720db32fef845c68e8a7df6e1295371fbe6b757313eca5ab03b7a28d36ad28",
"docContent": "<p><br></p>",
"checkedStatus": "done",
"comments": [],
"lastCheckedTimestamp": 1747070393555,
"lastFileModificationTimestamp": 1747070388417.1619
}
......@@ -2,9 +2,9 @@
"fileId": "22983490-9c01-4bd1-8649-dfe87c659225",
"originalPath": "work/config/MauiConfig.py",
"currentPath": "work/config/MauiConfig.py",
"hash": "6e627f3800fd413c6dbde92ad2e274d5e3047af0f906de4d75fc826cc129631e",
"hash": "08c57a67f7a74d7b702b572da3cd912bf4603ee97e9495a6be2ce60b73beab20",
"docContent": "<p><br></p>",
"checkedStatus": "todo",
"checkedStatus": "done",
"comments": [
{
"commentId": "3bc16f5e-4032-44a8-9012-4b632849ba50",
......@@ -12,6 +12,6 @@
"timestamp": 1744614418809
}
],
"lastCheckedTimestamp": 1746694114141,
"lastFileModificationTimestamp": 1745313945182.1555
"lastCheckedTimestamp": 1747070436322,
"lastFileModificationTimestamp": 1747043388546.054
}
......@@ -2,7 +2,7 @@
"fileId": "36e791b4-e235-42f6-ac61-8560f1762892",
"originalPath": "work/workbench/Workbench.mwb",
"currentPath": "work/workbench/Workbench.mwb",
"hash": "d53db9e9d211116d4aafc32106a7e0c05a86c062af72f21a37420853a1c4eacc",
"hash": "afea3df7165b4dad78d5a9d92ede4fec601f68ed0a14147532cf1ca00617c29e",
"docContent": "<p><br></p>",
"checkedStatus": "done",
"comments": [
......@@ -12,6 +12,6 @@
"timestamp": 1746693753181
}
],
"lastCheckedTimestamp": 1746693747974,
"lastFileModificationTimestamp": 1746440499172.53
"lastCheckedTimestamp": 1747070021483,
"lastFileModificationTimestamp": 1747068819870.2563
}
......@@ -2,9 +2,9 @@
"fileId": "38b9eebe-955e-4052-a0f6-29c69b1242b3",
"originalPath": "work/config/MysqlConfig.py",
"currentPath": "work/config/MysqlConfig.py",
"hash": "8eeae892f7c5f5aa1e894ca9ff7b8c66ea2891bc37c0167c404cd6e0cb95f858",
"hash": "d8958dba0bf7c100587dabf6ff576e0a1905a0aed4980a6c64ff6254f3671e5a",
"docContent": "<p><br></p>",
"checkedStatus": "todo",
"checkedStatus": "done",
"comments": [
{
"commentId": "56c5adba-20f4-4524-a894-41f81ab7ca55",
......@@ -12,6 +12,6 @@
"timestamp": 1744622354948
}
],
"lastCheckedTimestamp": 1745314583521,
"lastFileModificationTimestamp": 1745313973064.8933
"lastCheckedTimestamp": 1747070439965,
"lastFileModificationTimestamp": 1747070428651.6106
}
{
"fileId": "48126029-3c3e-4372-9f3e-1e8b9686114e",
"originalPath": "work/commands/importCacheToDatabase.py",
"currentPath": "work/commands/importCacheToDatabase.py",
"hash": "3c19dde87c665d72591ff9391a0dd6ec28218002116df06b22217a09d2a73e27",
"docContent": "<p><br></p>",
"checkedStatus": "done",
"comments": [],
"lastCheckedTimestamp": 1747070815750,
"lastFileModificationTimestamp": 1747070802673.0312
}
......@@ -2,9 +2,9 @@
"fileId": "4c784f14-4710-4694-bf73-f5665baab43f",
"originalPath": "work/cron.sh",
"currentPath": "work/cron.sh",
"hash": "4c6e694c417005a79207a32c26609e0e2701f17d6484a536bd8188bf8dcceb93",
"hash": "8950b2d4895462785979d16bc1db8830ed40cca30d60a680df360f731b95baa9",
"docContent": "<p><br></p>",
"checkedStatus": "todo",
"checkedStatus": "done",
"comments": [
{
"commentId": "e5f40597-ae51-440f-886a-44f06dbe8e96",
......@@ -12,6 +12,6 @@
"timestamp": 1746693690181
}
],
"lastCheckedTimestamp": 1746693667833,
"lastFileModificationTimestamp": 1746448049902.1914
"lastCheckedTimestamp": 1747070016564,
"lastFileModificationTimestamp": 1747070003854.197
}
......@@ -2,10 +2,10 @@
"fileId": "58307c8c-416a-4c24-adc9-7ed6324d1f8a",
"originalPath": "work/manager/WebManager.py",
"currentPath": "work/manager/WebManager.py",
"hash": "66f022cdc155ded9c47e49a893ed3070099faa960a72aa01d23929a1c02a8657",
"hash": "5a987f9d37c2d083c9a04ea0a0c4739a4fbd6c2e6c98877a0f64fadb45717a1c",
"docContent": "<p><br></p>",
"checkedStatus": "changed",
"checkedStatus": "done",
"comments": [],
"lastCheckedTimestamp": 1746694408088,
"lastFileModificationTimestamp": 1746696251620.3523
"lastCheckedTimestamp": 1747070585799,
"lastFileModificationTimestamp": 1747070580506.2974
}
{
"fileId": "5f874bee-40e2-4b9a-b102-f0b6d643a840",
"originalPath": "work/commands/downloadDataFromMaui.py",
"currentPath": "work/commands/downloadDataFromMaui.py",
"hash": "f75e0013a123055434a4592bb3509d3ff298273727226ef431662f3fad739fad",
"docContent": "<p><br></p>",
"checkedStatus": "done",
"comments": [],
"lastCheckedTimestamp": 1747071242419,
"lastFileModificationTimestamp": 1747071185164.952
}
{
"fileId": "62aea232-2549-437e-b5a9-72cb2aa92d16",
"originalPath": "work/commands/calculateTarifDetailsWithGpt.py",
"currentPath": "work/commands/calculateTarifDetailsWithGpt.py",
"hash": "9161246779e6b04e4ae512afe91cfae14ad7e9fc28395d42f54627e3b70a25b0",
"docContent": "<p><br></p>",
"checkedStatus": "done",
"comments": [],
"lastCheckedTimestamp": 1747071244862,
"lastFileModificationTimestamp": 1747071237273.2832
}
......@@ -2,9 +2,9 @@
"fileId": "647ff9a8-a56f-486e-ba2a-8ff77e4514d4",
"originalPath": "work/Dockerfile",
"currentPath": "work/Dockerfile",
"hash": "d885a8a45174b2f425d3c0201b797754c69f6ff798dabd51f0e53af17b047964",
"hash": "ca6a37e37aff3fff276f8020d9860b94c6556181bbb65a3fe1c62f3868f7f0b3",
"docContent": "<p><br></p>",
"checkedStatus": "changed",
"checkedStatus": "done",
"comments": [
{
"commentId": "2a07c637-2149-4d5a-870d-94870f78945d",
......@@ -12,6 +12,6 @@
"timestamp": 1746693591017
}
],
"lastCheckedTimestamp": 1746693552978,
"lastFileModificationTimestamp": 1746694865448.947
"lastCheckedTimestamp": 1747069787674,
"lastFileModificationTimestamp": 1747069781547.9219
}
......@@ -2,9 +2,9 @@
"fileId": "766dc461-001e-4901-8faf-263820ad96cd",
"originalPath": "work/manager/MysqlManager.py",
"currentPath": "work/manager/MysqlManager.py",
"hash": "27129c35df4b6b0e4d5fcb7a77c8e1c19d1b74f80d5c3ec822cdc26701124a68",
"hash": "9a9ca8572ad133ef4a191b7082ffb025d979f84dce2109ff5be33108cb807652",
"docContent": "<p><br></p>",
"checkedStatus": "changed",
"checkedStatus": "done",
"comments": [
{
"commentId": "7227a7a0-99bc-47b4-a725-3547eb56015d",
......@@ -12,7 +12,7 @@
"timestamp": 1746694262639
}
],
"lastCheckedTimestamp": 1745314589383,
"lastFileModificationTimestamp": 1746696474493.3755,
"lastCheckedTimestamp": 1747070649128,
"lastFileModificationTimestamp": 1747070643843.446,
"flaggedForCopy": false
}
{
"fileId": "78db1316-a768-4c1f-b15c-7a408444a030",
"originalPath": "work/routes/HealtCheckRouter.py",
"currentPath": "work/routes/HealtCheckRouter.py",
"hash": "965774cdb8edb7b68ec0341e4d122765853c5fa82b5432df2843234c25874f3d",
"docContent": "<p><br></p>",
"checkedStatus": "done",
"comments": [],
"lastCheckedTimestamp": 1747070215651,
"lastFileModificationTimestamp": 1747070210301.3403
}
......@@ -2,9 +2,9 @@
"fileId": "7a3a246b-fc0e-4c80-b748-96b941efab5c",
"originalPath": "work/config/AWSConfig.py",
"currentPath": "work/config/AWSConfig.py",
"hash": "5a6654cb1cd77f8d531fcc1541d31261ea02c4e8cb126f2cc43a217c9c6920aa",
"hash": "29bc59fd6ecbf98aa7efcfa3ef371bb912ac144f077895c279bb80ba150ee734",
"docContent": "<p><br></p>",
"checkedStatus": "todo",
"checkedStatus": "done",
"comments": [
{
"commentId": "3c070677-67c2-458d-8ad9-1ef595c16e0e",
......@@ -12,6 +12,6 @@
"timestamp": 1746694106055
}
],
"lastCheckedTimestamp": 1745314580866,
"lastFileModificationTimestamp": 1745311719614.9841
"lastCheckedTimestamp": 1747070433793,
"lastFileModificationTimestamp": 1747042244230.3608
}
{
"fileId": "8c1b7b54-86c0-453c-839c-95390d883819",
"originalPath": "work/commands/uploadCacheToAwsS3.py",
"currentPath": "work/commands/uploadCacheToAwsS3.py",
"hash": "158bb6839fc011bfeb8ab54d335f897c629228f2169844f2f6118f803b80c64f",
"docContent": "<p><br></p>",
"checkedStatus": "done",
"comments": [],
"lastCheckedTimestamp": 1747070735035,
"lastFileModificationTimestamp": 1747070723183.9927
}
......@@ -2,16 +2,21 @@
"fileId": "986eeb57-8634-4f40-a4ea-a2eae9d87e71",
"originalPath": "work/readme.md",
"currentPath": "work/readme.md",
"hash": "3e2bf4db6ad284fb011128f2ac0d3cf7849268068a39b160418173f0230ba4bd",
"hash": "455ce9ea71460c0f1f2b43ad6ea5c9706a4f2003e46d60622086b7cab47925db",
"docContent": "<p><br></p>",
"checkedStatus": "changed",
"checkedStatus": "todo",
"comments": [
{
"commentId": "574b8332-b3c0-4afa-9f2b-8a632e910e0d",
"text": "I need to insert the AWS-ECR-Uplink-Data.",
"timestamp": 1746693537936
},
{
"commentId": "e5d599b4-3080-4638-b0a8-753fb4dd3c9b",
"text": "Only the video tutorials are missing...",
"timestamp": 1747069658074
}
],
"lastCheckedTimestamp": 1746693903209,
"lastFileModificationTimestamp": 1746694946510.8994
"lastCheckedTimestamp": 1747069646363,
"lastFileModificationTimestamp": 1747069621488.916
}
{
"fileId": "b71a7bf5-594a-4ac1-9113-25158c35bcb4",
"originalPath": "work/models/token_toke.py",
"currentPath": "work/models/token_toke.py",
"hash": "609e9fec7718b6125d047c7a8c7029b7d0cd2b95df889334914ce058091176bd",
"docContent": "<p><br></p>",
"checkedStatus": "done",
"comments": [],
"lastCheckedTimestamp": 1747070070388,
"lastFileModificationTimestamp": 1747068898597.5894
}
......@@ -2,9 +2,9 @@
"fileId": "caf03c7b-60d8-4a77-ac21-0eccabeae4a2",
"originalPath": "work/boot.sh",
"currentPath": "work/boot.sh",
"hash": "d665dba2f614cbf283cf1900c259bea8472f31353be894740e06535e6c3936c3",
"hash": "9d08025500b916fe294de7aa8b533c29e02743c714022415ae9274e066c4fa6a",
"docContent": "<p><br></p>",
"checkedStatus": "todo",
"checkedStatus": "done",
"comments": [
{
"commentId": "6ba2875c-14b5-4444-a34e-52295efd65bc",
......@@ -12,6 +12,6 @@
"timestamp": 1746693713037
}
],
"lastCheckedTimestamp": 1746693711224,
"lastFileModificationTimestamp": 1746447735575.9163
"lastCheckedTimestamp": 1747070094947,
"lastFileModificationTimestamp": 1747070086824.1516
}
{
"fileId": "e4ffba94-a5e6-40d2-a63e-5bfa60e3d719",
"originalPath": "work/routes/BaseRouter.py",
"currentPath": "work/routes/BaseRouter.py",
"hash": "e8f5cd3137261214985789cc7e328848d9ff379525c706b9acf6ee7a6ea3adec",
"docContent": "<p><br></p>",
"checkedStatus": "done",
"comments": [],
"lastCheckedTimestamp": 1747070513561,
"lastFileModificationTimestamp": 1747070506645.0361
}
......@@ -4,7 +4,7 @@
"currentPath": "work/config/OpenAiConfig.py",
"hash": "50c0f7d96f9ea76aa069a0a24137e898dbd4fc3c4af867565c90468981bf6ff5",
"docContent": "<p><br></p>",
"checkedStatus": "todo",
"checkedStatus": "done",
"comments": [
{
"commentId": "1b2c6a64-0a75-4763-9613-12634d96bed2",
......@@ -12,6 +12,6 @@
"timestamp": 1746694100919
}
],
"lastCheckedTimestamp": 1746694087733,
"lastCheckedTimestamp": 1747070442728,
"lastFileModificationTimestamp": 1746437070245.503
}
# Base image: Ubuntu 24.04.
FROM ubuntu:24.04
# Suppress interactive prompts from apt during the image build.
ENV DEBIAN_FRONTEND=noninteractive
# Refresh the package index to get current package metadata.
RUN apt-get -y update
# Upgrade all pre-installed packages to their latest versions.
RUN apt-get -y upgrade
# Provides add-apt-repository, needed below for PPA management.
RUN apt-get install -y software-properties-common
# Python 3 runtime for the application code.
RUN apt-get install -y python3
# pip for installing Python packages.
RUN apt-get install -y python3-pip
# Add the xtradeb PPA so a deb-based (non-snap) Chromium build can be installed.
RUN add-apt-repository ppa:xtradeb/apps -y
# Refresh the index again so packages from the newly added PPA become visible.
RUN apt-get -y update
# Chromium browser for headless / automated browsing.
RUN apt-get install -y chromium-browser
# Chromium WebDriver so Selenium can control Chromium.
RUN apt-get install -y chromium-driver
# Gecko WebDriver so Selenium can control Firefox.
RUN apt-get install -y firefox-geckodriver
# Remove snapd to ensure the snap build of Chromium is never used.
RUN apt-get remove -y snapd
# cron, so scheduled tasks can run inside the container.
RUN apt-get install -y cron
# Vim for in-container file editing.
RUN apt-get install -y vim
# htop for real-time resource monitoring.
RUN apt-get install -y htop
# FFmpeg for audio and video processing tasks.
RUN apt-get install -y ffmpeg
# curl for command-line data transfer over various protocols.
RUN apt-get install -y curl
# Python dependencies follow. --break-system-packages is required to install
# into the system Python on Ubuntu 24.04 (PEP 668 managed environment).
# Selenium: drives browser automation from Python.
RUN pip3 install --break-system-packages selenium
# requests: simple HTTP client.
RUN pip3 install --break-system-packages requests
# SQLAlchemy: ORM for database access.
RUN pip3 install --break-system-packages sqlalchemy
# PyMySQL: MySQL client library for Python.
RUN pip3 install --break-system-packages pymysql
# pandas: data analysis and manipulation.
RUN pip3 install --break-system-packages pandas
# BeautifulSoup 4: HTML and XML parsing.
RUN pip3 install --break-system-packages bs4
# feedparser: parses RSS and Atom feeds.
RUN pip3 install --break-system-packages feedparser
# demjson3: tolerant parser for non-strict JSON.
RUN pip3 install --break-system-packages demjson3
# Flask: web server (--ignore-installed overrides any distro-provided version).
RUN pip3 install --break-system-packages --ignore-installed flask
# feedgen: programmatic RSS/Atom feed generation.
RUN pip3 install --break-system-packages feedgen
# boto3: AWS SDK for Python.
RUN pip3 install --break-system-packages boto3
# pydub: audio file manipulation.
RUN pip3 install --break-system-packages pydub
# json5: parse and generate JSON5 data.
RUN pip3 install --break-system-packages json5
# pyotp: generate and verify one-time passwords.
RUN pip3 install --break-system-packages pyotp
# sshtunnel: SSH tunnels from Python.
RUN pip3 install --break-system-packages sshtunnel
# pypdf: PDF manipulation from Python.
RUN pip3 install --break-system-packages pypdf
# Copy the cron table into the container's cron.d directory.
COPY config/_CronConfig.txt /etc/cron.d/scrapeNewsCron
# cron requires its configuration files to have 0644 permissions.
RUN chmod 0644 /etc/cron.d/scrapeNewsCron
# Register the cron table so the scheduled tasks become active.
RUN crontab /etc/cron.d/scrapeNewsCron
# Copy the shell script that cron executes.
COPY cron.sh /maui/cron.sh
# Make the cron shell script executable.
RUN chmod +x /maui/cron.sh
# Copy the application source directories into the container.
# config: system settings.
COPY config /maui/config
# manager: management utilities.
COPY manager /maui/manager
# commands: command-line tools.
COPY commands /maui/commands
# models: ORM models.
COPY models /maui/models
# routes: web route definitions.
COPY routes /maui/routes
# Copy the boot script, which serves as the container's entry point.
COPY boot.sh /maui/boot.sh
# Make the boot script executable so it can be run as the default command.
RUN chmod +x /maui/boot.sh
# Default command: run the boot script when the container starts.
CMD ["/maui/boot.sh"]
#!/bin/bash
set -e
# Start the cron daemon so the scheduled jobs run inside the container.
service cron start
# Change into the application manager directory.
# NOTE(review): older comments mentioned /obsidian/manager; the actual path is /maui/manager.
cd /maui/manager
# Export the project root so Python can resolve the top-level packages.
export PYTHONPATH=/maui
# Launch the WebManager API web server in the foreground.
python3 WebManager.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Hier wird der Suchpfad um das übergeordnete Verzeichnis erweitert, damit lokale Module gefunden werden.
import sys
sys.path.append("..")
#
# Hier werden Funktionen des Betriebssystems eingebunden.
import sys; sys.path.append("..")
import os
#
# Hier werden reguläre Ausdrücke zur Textbearbeitung eingebunden.
import re
#
# Hier wird das ast-Modul eingebunden, um Python-ähnliche Literale zu parsen.
import ast
#
# Hier wird das json-Modul eingebunden, um JSON-Daten zu verarbeiten.
import json
#
# Hier wird das datetime-Modul unter dem Alias _dt eingebunden, um Zeitstempel zu erzeugen.
import datetime as _dt
#
# Hier wird das traceback-Modul eingebunden, um Fehlermeldungen formatiert auszugeben.
import traceback
#
# Hier werden Typ-Alias-Definitionen aus dem typing-Modul eingebunden, um den Code klarer zu gestalten.
from typing import Any, Dict, List, Tuple
#
# Hier wird die pypdf-Bibliothek eingebunden, um Textinhalte aus PDF-Dateien zu extrahieren.
from pypdf import PdfReader
#
# Hier wird eine spezifische Exception aus pypdf eingebunden, um Leseprobleme differenziert zu behandeln.
from pypdf.errors import PdfReadError
#
# Hier wird der OpenAI-Manager eingebunden, um Chat-Nachrichten an GPT-Modelle zu senden.
from manager.OpenAiManager import OpenAiManager
#
# Hier wird der MySQL-Manager eingebunden, um Datenbank-Sessions zu erzeugen.
from manager.MysqlManager import MysqlManager
#
# Hier werden die SQLAlchemy-Basisklassen eingebunden, damit alle Models korrekt referenziert werden.
from models._system import Base
#
# Hier wird das Model BaseBase eingebunden, das die Haupttabelle für Tarife repräsentiert.
from models.base_base import BaseBase
#
# Hier wird das Model DealDeal eingebunden, das zugehörige Deal-Einträge verwaltet.
from models.deal_deal import DealDeal
#
# Hier wird das Model OptionOpti eingebunden, das optionale Tarif-Bausteine abbildet.
from models.option_opti import OptionOpti
#
# Hier wird der vollständige Prompt als mehrzeiliger String definiert, der alle Extraktionsregeln beinhaltet.
# This variable holds the complete multi-line prompt that includes all extraction rules
promptTemplate: str = (
"""
Du bist eine hochpräzise API zur Extraktion spezifischer Mobilfunktarif-Merkmale aus Dokumentenpaaren. Deine Eingabe besteht immer aus dem extrahierten Text von zwei PDF-Dateien: einem **Produktdetailblatt/Flyer** und einem **Produktinformationsblatt (PIB)**, die gemeinsam *einen* spezifischen Tarif beschreiben.
......@@ -114,8 +61,7 @@ Numerische Werte als Number belassen, Netto stets auf 4 Nachkommastellen runden.
"""
)
#
# Hier wird die Liste der Schlüssel definiert, die im GPT-Ergebnis zwingend vorhanden sein müssen.
# This variable holds the list of keys that must be present in the GPT response
expectedKeys: List[str] = [
"tariff_name",
"marketing_start_date",
......@@ -148,61 +94,84 @@ expectedKeys: List[str] = [
]
#
# Diese Funktion extrahiert den kompletten Text einer PDF-Datei und gibt ihn als String zurück.
# This function extracts the complete text from a PDF file and returns it as a string
def extractTextFromPdf(pdfPath: str) -> str | None:
# This condition checks if the file does not exist, returning None if missing
if not os.path.exists(pdfPath):
print(f"INFO: Datei nicht gefunden: {os.path.basename(pdfPath)}")
return None
# This variable holds all page texts extracted from the PDF
pageTexts: List[str] = []
# This block attempts to open and read the PDF file
try:
with open(pdfPath, "rb") as fileHandle:
# This line initializes the PDF reader to parse the file
reader = PdfReader(fileHandle)
# This loop iterates through the pages of the PDF to extract text
for page in reader.pages:
txt = page.extract_text()
# This condition checks if text was actually extracted from the page
if txt:
pageTexts.append(txt)
# This condition checks if no text was extracted from the PDF
if not pageTexts:
print(f"INFO: Kein Text in {os.path.basename(pdfPath)}")
return None
# This line returns the joined text from all pages
return "\n".join(pageTexts).strip()
# This block handles a specific PDF reading error from pypdf
except PdfReadError as exc:
print(f"WARNUNG: pypdf-Lesefehler bei '{os.path.basename(pdfPath)}': {exc}")
return None
# This block handles any other unexpected errors
except Exception:
print(f"FEHLER: Unerwarteter Fehler bei '{os.path.basename(pdfPath)}':")
traceback.print_exc(limit=1)
return None
#
def stripCodeFence(raw: str) -> str:
    """Strip Markdown code fences (``` with an optional language tag) so only raw JSON remains."""
    # Unfenced payloads pass through untouched.
    if not raw.strip().startswith("```"):
        return raw
    # Remove every fence marker (including e.g. ```json) and trim the remainder.
    return re.sub(r"```[\w]*", "", raw).strip()
#
def removeTrailingCommas(js: str) -> str:
    """Delete commas that directly precede a closing brace or bracket, repairing sloppy JSON."""
    # Capture the closing bracket (with any leading whitespace) and drop the comma.
    trailingCommaPattern = re.compile(r",(\s*[}\]])")
    return trailingCommaPattern.sub(r"\1", js)
#
# Diese Funktion versucht, einen String in ein Dictionary umzuwandeln und nutzt mehrere Reparatur-Ansätze.
# This function tries to parse a string as JSON using multiple repair approaches
def loadJsonSafe(raw: str) -> Dict[str, Any] | None:
# This variable holds the cleaned string without carriage returns
cleaned = stripCodeFence(raw).replace("\r", "")
# This loop attempts different variants of the cleaned string for JSON decoding
for variant in (cleaned, removeTrailingCommas(cleaned)):
try:
return json.loads(variant)
except json.JSONDecodeError:
pass
# This block tries a relaxed approach using Python literal evaluation after replacements
try:
relaxed = cleaned.replace("null", "None").replace("true", "True").replace("false", "False")
return ast.literal_eval(relaxed)
......@@ -210,95 +179,151 @@ def loadJsonSafe(raw: str) -> Dict[str, Any] | None:
return None
#
def validateResponse(raw: str) -> Tuple[bool, Dict[str, Any] | None]:
    """Parse the GPT answer and verify that every mandatory key is present.

    Returns (True, data) when the answer is a JSON object containing all
    keys from expectedKeys, otherwise (False, None).
    """
    parsed = loadJsonSafe(raw)
    # A None result or any non-dict value means the answer is unusable.
    if not isinstance(parsed, dict):
        print("VALIDATION: Antwort ist kein gültiges JSON-Objekt.")
        return False, None
    # Collect every required key the answer failed to provide.
    absentKeys = [requiredKey for requiredKey in expectedKeys if requiredKey not in parsed]
    if absentKeys:
        print(f"VALIDATION: Fehlende Schlüssel: {', '.join(absentKeys)}")
        return False, None
    # All checks passed: hand the parsed object back to the caller.
    return True, parsed
#
# Script entry point: pair up "<id>_flyer.pdf" / "<id>_pib.pdf" files from the
# cache directory, send their text to GPT, and store the validated JSON result
# in the matching BaseBase rows.
if __name__ == "__main__":
    # Directory that holds the downloaded tariff PDFs (relative to the working dir).
    cacheDir = "../cache"
    print(f"INFO: Suche nach PDF-Dateien in '{cacheDir}' …")
    # Abort with a non-zero exit code when the cache directory is missing.
    if not os.path.isdir(cacheDir):
        print("FEHLER: Cache-Verzeichnis nicht gefunden.")
        sys.exit(1)
    # Every file with a .pdf extension (matched case-insensitively).
    pdfFiles = [f for f in os.listdir(cacheDir) if f.lower().endswith(".pdf")]
    # Tariff IDs derived from the "<id>_flyer.pdf" / "<id>_pib.pdf" naming scheme.
    tariffIds: set[str] = set()
    for f in pdfFiles:
        # Filename without the ".pdf" suffix, lower-cased for uniform matching.
        stem = f[:-4].lower()
        # "<id>_flyer" -> strip the 6-character "_flyer" suffix.
        if stem.endswith("_flyer"):
            tariffIds.add(stem[:-6])
        # "<id>_pib" -> strip the 4-character "_pib" suffix.
        elif stem.endswith("_pib"):
            tariffIds.add(stem[:-4])
    # Nothing to do when no flyer/PIB pair candidates were found.
    if not tariffIds:
        print("INFO: Keine passenden PDF-Paare gefunden.")
        sys.exit(0)
    # GPT client used for all extraction requests.
    gptManager = OpenAiManager()
    # Database session shared across the whole run; closed at the end.
    dbSession = MysqlManager().getSession()
    # Process the tariff IDs in a deterministic order.
    for tariffId in sorted(tariffIds):
        print(f"\n--- Verarbeitung ID: {tariffId} ---")
        # All BaseBase rows whose provider code matches this tariff ID.
        baseRecords = dbSession.query(BaseBase).filter_by(providercode_base=tariffId).all()
        # Skip IDs that have no database rows at all.
        if not baseRecords:
            print("WARNUNG: Kein BaseBase-Datensatz gefunden – übersprungen.")
            continue
        # Skip IDs whose rows already carry extracted details (idempotent re-runs).
        if all(br.details_base for br in baseRecords):
            print("INFO: details_base bereits für alle Zeilen gefüllt – übersprungen.")
            continue
        # Expected locations of the two source PDFs for this tariff.
        flyerPath = os.path.join(cacheDir, f"{tariffId}_flyer.pdf")
        pibPath = os.path.join(cacheDir, f"{tariffId}_pib.pdf")
        # Extract the raw text of both documents (None on any failure).
        flyerText = extractTextFromPdf(flyerPath)
        pibText = extractTextFromPdf(pibPath)
        # Both texts are required; skip the tariff when either is missing.
        if not flyerText or not pibText:
            print("INFO: Fehlende Texte – übersprungen.")
            continue
        # Full prompt: extraction rules followed by both labelled document texts.
        fullPrompt = promptTemplate + "# Flyer-Text:\n" + flyerText + "\n" + "# PIB-Text:\n" + pibText
        # Holds the parsed GPT result once an attempt validates successfully.
        validatedData: Dict[str, Any] | None = None
        # Retry loop: up to three GPT attempts per tariff.
        for attempt in range(1, 4):
            print(f"INFO: GPT-Abfrage Versuch {attempt}/3 …")
            try:
                raw = gptManager.chat(fullPrompt, model="gpt-4.1")
            # A failed request is treated like an empty (invalid) answer.
            except Exception as exc:
                print(f"FEHLER: GPT-Abfrage fehlgeschlagen: {exc}")
                raw = ""
            # Parse the answer and check that all required keys are present.
            ok, parsed = validateResponse(raw)
            if ok:
                validatedData = parsed
                break
            print("WARNUNG: Antwort ungültig – nächster Versuch …")
        # Give up on this tariff after three invalid answers.
        if not validatedData:
            print("FEHLER: Drei ungültige Antworten – übersprungen.")
            continue
        # Write the result only into rows that are still empty, stamping the update time.
        for br in baseRecords:
            if br.details_base is None:
                br.details_base = validatedData
                br.updated_base = _dt.datetime.now()
        # Persist all row updates for this tariff in one commit.
        dbSession.commit()
        print(f"INFO: JSON in {len([b for b in baseRecords if b.details_base])} Zeile(n) gespeichert.")
    # Release the database session once every tariff has been processed.
    dbSession.close()
    print("INFO: Verarbeitung abgeschlossen.")
import sys
# In diesem import wird der Pfad um eine Ebene nach oben erweitert.
sys.path.append("..")
# In diesem import werden Funktionen des Betriebssystems eingebunden.
import sys; sys.path.append("..")
import os
# In diesem import wird Funktionalität zum Lesen und Schreiben von CSV-Dateien eingebunden.
import csv
# In diesem import wird Funktionalität zum Arbeiten mit Datum und Uhrzeit eingebunden.
import datetime
# In diesem import wird Funktionalität zum Hinzufügen von Pausen im Code eingebunden.
import time
# In diesem import wird die Bibliothek für zeitbasierte Einmalpasswörter eingebunden.
import pyotp
# In diesem import wird Funktionalität für reguläre Ausdrücke eingebunden.
import re
# In diesem import wird Funktionalität für Base64-Codierung eingebunden.
import base64
# In diesem import wird eine Klasse für genaue Dezimalberechnungen eingebunden.
from decimal import Decimal
# In diesem import wird die Funktionalität zur Rückverfolgung von Fehlern eingebunden.
import traceback
# In diesem import wird Funktionalität zum Kopieren und Löschen von Dateien eingebunden.
import shutil
# In diesem import wird die Bibliothek zum Umgang mit Pandas-Datenstrukturen eingebunden.
import pandas as pd
# In diesem import wird Funktionalität für HTTP-Anfragen eingebunden.
import requests
# In diesem import wird die Bibliothek BeautifulSoup zum Parsen von HTML eingebunden.
from bs4 import BeautifulSoup
# In diesem import werden verschiedene Selektoren aus Selenium eingebunden.
from selenium.webdriver.common.by import By
# In diesem import werden Aktionen zum Simulieren von Mausbewegungen eingebunden.
from selenium.webdriver.common.action_chains import ActionChains
# In diesem import wird eine explizite Wartefunktion für Selenium geladen.
from selenium.webdriver.support.ui import WebDriverWait
# In diesem import wird eine Klasse zum Erstellen von Dropdown-Auswahlen eingebunden.
from selenium.webdriver.support.ui import Select
# In diesem import werden verschiedene Bedingungen für Selenium-Wartefunktionen eingebunden.
from selenium.webdriver.support import expected_conditions as EC
# In diesem import wird eine spezielle Ausnahme für Zeitüberschreitungen in Selenium geladen.
from selenium.webdriver.support.wait import TimeoutException
# In diesem import wird eine Ausnahme für nicht vorhandene Elemente in Selenium geladen.
from selenium.common.exceptions import NoSuchElementException
# In dieser import-Anweisung wird der SeleniumManager geladen.
from manager.SeleniumManager import SeleniumManager
# In dieser import-Anweisung werden Zugangsdaten aus der MauiConfig geladen.
from config.MauiConfig import MAUI_USERNAME, MAUI_PASSWORD, MAUI_AUTHCODE
# In dieser import-Anweisung wird ein MySQL-Manager zum Umgang mit Datenbanken geladen.
from manager.MysqlManager import MysqlManager
# In dieser import-Anweisung werden Modelle aus der System-Klasse geladen.
from models._system import Base
# In dieser import-Anweisung werden Basisklassen für Datenbankmodelle geladen.
from models.base_base import BaseBase
# In dieser import-Anweisung wird das Modell DealDeal eingebunden.
from models.deal_deal import DealDeal
# In dieser import-Anweisung wird das Modell OptionOpti eingebunden.
from models.option_opti import OptionOpti
# Module-level set of category IDs that have already been written to
# categorys.csv; shared across all tariffs (see scrapeOption) so duplicate
# category rows are emitted only once per run.
uniqueCategorySet = set()
# In dieser Funktion wird eine PDF-Datei aus dem Selenium-Kontext als Base64 heruntergeladen und abgespeichert.
# This function downloads a PDF file as Base64 from the Selenium context and saves it.
def downloadPdfSelenium(seleniumDriver, pdfUrl, downloadFolder, fileName):
# In dieser Variablen wird ein Skript abgelegt, das als asynchroner Aufruf eine PDF-Datei anfordert und als Base64-String zurückliefert.
# This variable holds the asynchronous JavaScript code that requests the PDF as Base64.
downloadScript = """
var callback = arguments[arguments.length - 1];
var xhr = new XMLHttpRequest();
......@@ -113,204 +52,194 @@ def downloadPdfSelenium(seleniumDriver, pdfUrl, downloadFolder, fileName):
xhr.send();
"""
# In dieser Variablen wird das Ergebnis des ausgeführten Skripts als Base64-String gespeichert.
# This variable stores the Base64 string that the script returns.
pdfBase64String = seleniumDriver.execute_async_script(downloadScript, pdfUrl)
# In dieser if-Abzweigung wird geprüft, ob die Base64-Rückgabe korrekt ist.
# This if-structure checks if we have a valid Base64 result.
if not pdfBase64String:
# In diesem Zweig wird eine Ausnahme ausgelöst, wenn kein gültiger Base64-Inhalt vorliegt.
# This line raises an exception if the PDF download failed.
raise Exception("Der PDF-Download per Selenium ist fehlgeschlagen.")
# In dieser Variablen wird der vollständige Pfad für die zu speichernde PDF-Datei ermittelt.
# This variable defines the full path where the PDF will be saved.
destinationPath = os.path.join(downloadFolder, fileName)
# In diesem with-Block wird die Zieldatei erstellt und mit dem dekodierten Inhalt befüllt.
# This with-structure opens the file in write-binary mode and writes the decoded PDF data.
with open(destinationPath, "wb") as pdfFile:
# An dieser Stelle wird der Base64-String dekodiert und in das PDF geschrieben.
# This line decodes the Base64 data and writes it into the file.
pdfFile.write(base64.b64decode(pdfBase64String))
# This function performs the interactive MAUI login (credentials + TOTP) via Selenium.
def login(seleniumManager, userName, userPassword, rawToken):
    """Log in to https://maui.md.de with username/password plus a TOTP second factor.

    seleniumManager: project SeleniumManager wrapping the Selenium webdriver.
    userName / userPassword: account credentials typed into the login form.
    rawToken: raw TOTP secret from which the current 2FA code is derived.
    """
    driver = seleniumManager.simpleRequest("https://maui.md.de")
    waiter = WebDriverWait(driver, 10)
    # Fill user name and password as soon as their fields are present in the DOM.
    waiter.until(EC.presence_of_element_located((By.ID, "mat-input-0"))).send_keys(userName)
    waiter.until(EC.presence_of_element_located((By.ID, "mat-input-1"))).send_keys(userPassword)
    # Short pause for form stability before submitting.
    time.sleep(1)
    waiter.until(
        EC.element_to_be_clickable((By.XPATH, "//button[.//span[contains(text(),'Anmelden')]]"))
    ).click()
    # Derive the current one-time code from the raw TOTP secret and enter it.
    twoFactorCode = pyotp.TOTP(rawToken).now()
    waiter.until(EC.presence_of_element_located((By.ID, "mat-input-2"))).send_keys(twoFactorCode)
    # The modal confirm button can be obscured, so click it via JavaScript
    # instead of a native click.
    modalButton = waiter.until(
        EC.element_to_be_clickable((By.XPATH, "//mat-dialog-actions//button[span[contains(text(),'Anmelden')]]"))
    )
    driver.execute_script("arguments[0].click();", modalButton)
# This function opens the "Laufzeitvertrag" page after a successful login.
def openLaufzeitvertrag(seleniumManager):
    """Navigate the managed driver to the Laufzeitvertrag page via its menu link."""
    driver = seleniumManager.driver
    linkElement = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//a[contains(text(),'Laufzeitvertrag')]"))
    )
    # Read the target URL from the anchor and load it directly.
    driver.get(linkElement.get_attribute("href"))
# This function waits until the tariff dropdown is usable, i.e. blocking overlays are gone.
def waitForDropdownReady(seleniumDriver, wait, maxRetries=3, retryDelay=5):
    """Return True once the tariff dropdown is interactable, False if all retries fail.

    seleniumDriver: active Selenium webdriver.
    wait: WebDriverWait used for the final dropdown-presence check.
    maxRetries: how often the whole readiness check is retried.
    retryDelay: seconds to sleep between retries.
    """
    for attempt in range(maxRetries):
        try:
            print(f"DEBUG: Warte auf Dropdown-Bereitschaft (Versuch {attempt + 1}/{maxRetries})...")
            # The wait.html iframe and the bg_layer overlay both block clicks;
            # give each up to 60 seconds to disappear.
            longWait = WebDriverWait(seleniumDriver, 60)
            longWait.until(EC.invisibility_of_element_located((By.XPATH, "//iframe[contains(@src, 'wait.html')]")))
            longWait.until(EC.invisibility_of_element_located((By.ID, "bg_layer")))
            # Finally require the tariff dropdown itself to be present in the DOM.
            wait.until(EC.presence_of_element_located((By.NAME, "tarif_id")))
            print(f"DEBUG: Dropdown ist bereit (Versuch {attempt + 1}).")
            return True
        except TimeoutException:
            print(f"DEBUG: Warnung: Timeout beim Warten auf Dropdown-Bereitschaft (Versuch {attempt + 1}/{maxRetries}).")
            finalError = f"DEBUG: FEHLER: Konnte nach {maxRetries} Versuchen nicht auf Dropdown-Bereitschaft warten."
        except Exception as exception:
            print(f"DEBUG: Unerwarteter Fehler beim Warten auf Dropdown (Versuch {attempt + 1}/{maxRetries}): {exception}")
            finalError = f"DEBUG: FEHLER: Konnte nach {maxRetries} Versuchen wegen unerwartetem Fehler nicht auf Dropdown warten."
        # Shared retry/backoff tail for both exception kinds: sleep and retry,
        # or report the final error on the last attempt.
        if attempt < maxRetries - 1:
            time.sleep(retryDelay)
        else:
            print(finalError)
            break
    # All attempts failed.
    return False
# This function reads the displayed tariff price and converts it to a net price.
def parsePlanPrice(seleniumDriver):
    """Return the net price (gross / 1.19, rounded to 5 digits) parsed from the
    'preis_anzeige_tarif' element, or 0.0 if no price can be extracted."""
    try:
        displayedText = seleniumDriver.find_element(By.ID, "preis_anzeige_tarif").text
        match = re.search(r'([\d\.,]+)\s*EUR', displayedText)
        if match is None:
            # No amount in the element text — fall back to 0.0 silently.
            return 0.0
        # Convert the German number format (e.g. 1.234,56) into a float-parsable form.
        normalized = match.group(1).replace(".", "").replace(",", ".")
        # Strip 19% VAT from the gross amount and round to five decimal places.
        return round(float(normalized) / 1.19, 5)
    except Exception as exception:
        # Best-effort: any lookup/parsing problem is logged and yields 0.0.
        print(f"DEBUG: Warnung: Konnte Tarifpreis nicht extrahieren: {exception}")
        return 0.0
# In dieser Funktion werden Kampagnen aus dem entsprechenden Dropdown geparst und als Liste zurückgegeben.
# This function parses available campaigns from the corresponding dropdown.
def parseCampaigns(seleniumDriver):
# In dieser Liste werden alle gefundenen Kampagnen-Tuples gespeichert.
# This variable is a list that collects all found campaigns as tuples.
campaignsList = []
# In diesem try-Block wird versucht, das Kampagnen-Select und dessen Optionen zu finden.
# This try-structure attempts to locate and parse the campaign select element.
try:
campaignSelect = seleniumDriver.find_element(By.NAME, "am_aktion_select")
campaignOptions = campaignSelect.find_elements(By.TAG_NAME, "option")
# In dieser for-Schleife werden die Attribute jeder Option ausgelesen und gefiltert.
# This for-structure iterates over all option elements in the campaign select.
for copt in campaignOptions:
val = copt.get_attribute("value")
txt = copt.text.strip()
# In dieser if-Abzweigung werden ungültige oder leere Werte übersprungen.
# This if-structure skips invalid or empty values.
if not val or val in [" |", "-1|", "|", "-1|", "0|"]:
continue
partsVal = val.split("|")
campaignId = partsVal[0].strip()
# In dieser if-Abzweigung wird geprüft, ob eine Kampagnen-ID extrahiert werden konnte.
# This if-structure checks if a valid campaign ID was extracted.
if not campaignId:
continue
# In dieser if-Abzweigung wird geprüft, ob ein Trennstrich im Text enthalten ist.
# This if-structure checks if the text has a dash that splits the campaign name.
if "-" in txt:
splitted = txt.split("-", 1)
campaignName = splitted[1].strip()
......@@ -319,87 +248,97 @@ def parseCampaigns(seleniumDriver):
campaignsList.append((campaignId, campaignName))
# In dieser except-Abzweigung wird ein Hinweis ausgegeben, falls das Element nicht gefunden werden konnte.
# This except-structure handles any error if the campaign select is not found.
except Exception as exception:
print(f"DEBUG: Warnung: Konnte Kampagnen nicht extrahieren: {exception}")
# In dieser Zeile wird die Liste der gefundenen Kampagnen zurückgegeben.
# This line returns the list of found campaigns.
return campaignsList
# In dieser Funktion werden die Hauptdaten gescraped und in verschiedene CSV-Dateien geschrieben.
# This function scrapes the main data and writes it to various CSV files.
def scrapeData(seleniumManager):
# In dieser Variablen wird der Selenium-Driver abgelegt.
# This variable holds the Selenium driver instance.
seleniumDriver = seleniumManager.driver
# In dieser Variablen wird ein WebDriverWait mit Timeout 20 Sekunden abgelegt.
# This variable is a WebDriverWait object with a 20-second timeout.
wait = WebDriverWait(seleniumDriver, 20)
# In dieser Variablen wird ein Pfad für das Cache-Verzeichnis definiert.
# This variable defines the path for the cache directory.
cacheDir = "../cache"
# In dieser if-Abzweigung wird geprüft, ob das Verzeichnis bereits existiert.
# This if-structure checks if the cache directory already exists.
if os.path.exists(cacheDir):
# In diesem try-Block wird versucht, das bestehende Verzeichnis zu löschen.
# This try-structure attempts to remove the existing directory if present.
try:
shutil.rmtree(cacheDir)
print(f"Info: Bestehendes Cache-Verzeichnis '{cacheDir}' wurde gelöscht.")
except OSError as exception:
print(f"Fehler beim Löschen von Verzeichnis {cacheDir}: {exception}")
# In diesem try-Block wird das Verzeichnis neu erstellt oder sichergestellt, dass es existiert.
# This try-structure ensures that the cache directory is created.
try:
os.makedirs(cacheDir, exist_ok=True)
print(f"Info: Cache-Verzeichnis '{cacheDir}' sichergestellt/neu erstellt.")
except OSError as exception:
print(f"Fehler beim Erstellen von Verzeichnis {cacheDir}: {exception}")
# In diesen Variablen werden die Pfade zu den einzelnen CSV-Dateien definiert.
# This variable stores the path for the plans.csv file.
plansCsvFilePath = os.path.join(cacheDir, "plans.csv")
# This variable stores the path for the campaigns.csv file.
campaignsCsvFilePath = os.path.join(cacheDir, "campaigns.csv")
# This variable stores the path for the options.csv file.
optionsCsvFilePath = os.path.join(cacheDir, "options.csv")
# This variable stores the path for the categorys.csv file.
categorysCsvFilePath = os.path.join(cacheDir, "categorys.csv")
# In dieser Menge werden bereits geschriebene Tarife gespeichert, um Duplikate zu vermeiden.
# This variable is a set used to track which tariffs have been written to avoid duplicates.
writtenPlanIdSet = set()
# In dieser Liste werden die möglichen Rahmenvertragsnummern abgelegt.
# This variable stores a list of possible framework contract numbers.
frameworkList = ["", 980066161, 980008940, 981000541]
# In diesem with-Block werden alle CSV-Dateien geöffnet und die Writer initialisiert.
# This with-structure opens all CSV files and prepares the writers.
with open(plansCsvFilePath, mode="w", newline="", encoding="utf-8") as plansFile, \
open(campaignsCsvFilePath, mode="w", newline="", encoding="utf-8") as campaignsFile, \
open(optionsCsvFilePath, mode="w", newline="", encoding="utf-8") as optionsFile, \
open(categorysCsvFilePath, mode="w", newline="", encoding="utf-8") as categorysFile:
# In diesen Variablen werden die CSV-Writer für jede Datei angelegt.
# This variable is the CSV writer for plans.csv.
plansWriter = csv.writer(plansFile, delimiter=";")
# This variable is the CSV writer for campaigns.csv.
campaignsWriter = csv.writer(campaignsFile, delimiter=";")
# This variable is the CSV writer for options.csv.
optionsWriter = csv.writer(optionsFile, delimiter=";")
# This variable is the CSV writer for categorys.csv.
categorysWriter = csv.writer(categorysFile, delimiter=";")
# In dieser Zeile werden die Spaltenüberschriften für die plans.csv geschrieben.
# This line writes the header row for plans.csv.
plansWriter.writerow(["id", "provider", "network", "name", "price", "rahmen"])
# In dieser Zeile werden die Spaltenüberschriften für die campaigns.csv geschrieben.
# This line writes the header row for campaigns.csv.
campaignsWriter.writerow(["id", "plan", "name"])
# In dieser Zeile werden die Spaltenüberschriften für die options.csv geschrieben.
# This line writes the header row for options.csv.
optionsWriter.writerow(["id", "category", "plan", "name", "price"])
# In dieser Zeile werden die Spaltenüberschriften für die categorys.csv geschrieben.
# This line writes the header row for categorys.csv.
categorysWriter.writerow(["id", "name"])
# In dieser for-Schleife werden alle Rahmenvertragsnummern durchlaufen.
# This for-structure iterates over each framework contract option.
for currentFramework in frameworkList:
# In dieser if-Abzweigung wird geprüft, ob wir eine Nummer im Rahmenfeld setzen müssen.
# This if-structure checks if we must set a framework number.
if currentFramework != "":
# In diesem try-Block wird die Checkbox für Rahmenvertrag angeklickt.
# This try-structure attempts to click the framework checkbox.
try:
wait.until(EC.element_to_be_clickable((By.NAME, "rv_option")))
rvCheckbox = seleniumDriver.find_element(By.NAME, "rv_option")
......@@ -408,7 +347,7 @@ def scrapeData(seleniumManager):
except Exception as exception:
print(f"DEBUG: Konnte Checkbox 'rv_option' nicht setzen: {exception}")
# In diesem try-Block wird das Eingabefeld für die Rahmenvertragsnummer gesetzt.
# This try-structure attempts to fill in the framework number field.
try:
wait.until(EC.presence_of_element_located((By.NAME, "rv_nr")))
rvNrField = seleniumDriver.find_element(By.NAME, "rv_nr")
......@@ -420,17 +359,18 @@ def scrapeData(seleniumManager):
else:
print("emptyRahmen")
# In dieser Zeile wird eine Wartezeit von 5 Sekunden eingefügt.
# This function call adds a delay of 5 seconds before further steps.
time.sleep(5)
# In diesen Variablen werden die aktuellen Tarifwelten und Netze neu ausgelesen.
# This variable stores the list of available tariff worlds by reading the elements.
tarifWeltElements = seleniumDriver.find_elements(By.NAME, "tarif_welt")
tarifWelten = [elem.get_attribute("value") for elem in tarifWeltElements if elem.get_attribute("value")]
# This variable stores the list of available networks by reading the elements.
netzElements = seleniumDriver.find_elements(By.NAME, "netz")
netzList = [elem.get_attribute("value") for elem in netzElements if elem.get_attribute("value")]
# In diesem try-Block wird der Radio-Button für die Produktkategorie 'A' geklickt.
# This try-structure attempts to select the product category 'A'.
try:
productCategoryElement = wait.until(EC.element_to_be_clickable((By.XPATH, '//input[@name="sel_produkt_kategorie" and @value="A"]')))
seleniumDriver.execute_script("arguments[0].click();", productCategoryElement)
......@@ -439,15 +379,15 @@ def scrapeData(seleniumManager):
traceback.print_exc()
continue
# In dieser for-Schleife werden alle gefundenen Tarifwelten durchlaufen.
# This for-structure iterates over each available tariff world.
for tarifWelt in tarifWelten:
# In dieser if-Abzweigung wird geprüft, ob das Dropdown bereit ist.
# This if-structure checks if the dropdown is ready.
if not waitForDropdownReady(seleniumDriver, wait):
print(f"DEBUG: Überspringe Tarifwelt {tarifWelt}, da die Seite nicht rechtzeitig bereit war.")
continue
# In diesem try-Block wird der jeweilige Radio-Button für die Tarifwelt geklickt.
# This try-structure attempts to click the radio button for the current tariff world.
try:
tarifWeltRadio = wait.until(EC.element_to_be_clickable((By.XPATH, f'//input[@name="tarif_welt" and @value="{tarifWelt}"]')))
seleniumDriver.execute_script("arguments[0].click();", tarifWeltRadio)
......@@ -456,15 +396,15 @@ def scrapeData(seleniumManager):
traceback.print_exc()
continue
# In dieser for-Schleife werden alle möglichen Netze durchlaufen.
# This for-structure iterates over each possible network.
for net in netzList:
# In dieser if-Abzweigung wird nochmals geprüft, ob das Dropdown bereit ist.
# This if-structure checks if the dropdown is still ready.
if not waitForDropdownReady(seleniumDriver, wait):
print(f"DEBUG: Überspringe Netz {net} in Tarifwelt {tarifWelt}, da die Seite nicht rechtzeitig bereit war.")
continue
# In diesem try-Block wird das jeweilige Netz geklickt.
# This try-structure attempts to click the radio button for the network.
try:
netRadio = wait.until(EC.element_to_be_clickable((By.XPATH, f'//input[@name="netz" and @value="{net}"]')))
seleniumDriver.execute_script("arguments[0].click();", netRadio)
......@@ -473,12 +413,12 @@ def scrapeData(seleniumManager):
traceback.print_exc()
continue
# In dieser if-Abzweigung wird noch einmal die Verfügbarkeit des Tarif-Dropdowns geprüft.
# This if-structure checks again if the tariff dropdown is ready.
if not waitForDropdownReady(seleniumDriver, wait):
print(f"DEBUG: Überspringe Netz {net} in Tarifwelt {tarifWelt}, da Tarif-Dropdown nicht bereit war.")
continue
# In diesem try-Block werden alle Tarifoptionen für das gegebene Netz ausgelesen.
# This try-structure collects all tariff options from the dropdown for the current network.
try:
dropdown = wait.until(EC.presence_of_element_located((By.NAME, "tarif_id")))
selectObj = Select(dropdown)
......@@ -488,26 +428,26 @@ def scrapeData(seleniumManager):
traceback.print_exc()
continue
# In dieser for-Schleife werden alle Tarife aus dem Dropdown verarbeitet.
# This for-structure iterates over each tariff in the dropdown.
for tariffId, optText in optionsToProcess:
# In dieser if-Abzweigung werden Platzhalterwerte übersprungen.
# This if-structure skips placeholder text and empty IDs.
if optText in ["Bitte wählen Sie aus...", ""] or not tariffId:
continue
# In dieser if-Abzweigung wird geprüft, ob der Tarif für diesen Rahmen schon erfasst wurde.
# This if-structure checks if the tariff is already written for the current framework.
if (tariffId, currentFramework) in writtenPlanIdSet:
print(f"DEBUG: Tarif {tariffId} für Rahmen {currentFramework} bereits in CSV, überspringe.")
continue
print(f"DEBUG: Verarbeite: {tariffId} - {net} - {optText} (Rahmen {currentFramework})")
# In dieser if-Abzweigung wird geprüft, ob das Dropdown weiterhin verfügbar ist.
# This if-structure checks the dropdown readiness again before proceeding.
if not waitForDropdownReady(seleniumDriver, wait):
print(f"DEBUG: Überspringe Tarif {tariffId} ({optText}), da die Seite nicht rechtzeitig bereit war.")
continue
# In diesem try-Block wird der passende Tarif im Dropdown gewählt.
# This try-structure selects the appropriate tariff in the dropdown.
try:
currentDropdown = wait.until(EC.presence_of_element_located((By.NAME, "tarif_id")))
currentSelectObj = Select(currentDropdown)
......@@ -524,23 +464,23 @@ def scrapeData(seleniumManager):
traceback.print_exc()
continue
# In diesem try-Block wird gewartet, bis das Overlay verschwindet.
# This try-structure waits for the overlay to become invisible after the selection.
try:
WebDriverWait(seleniumDriver, timeout=60).until(EC.invisibility_of_element_located((By.ID, "bg_layer")))
except TimeoutException:
print(f"DEBUG: FEHLER: Timeout beim Warten auf bg_layer nach Auswahl von Tarif {tariffId}. Überspringe...")
continue
# In dieser Zeile wird kurz gewartet, um die Preisanzeige stabil zu laden.
# This line waits briefly to stabilize the price display.
time.sleep(1.5)
# In dieser Variablen wird der Nettopreis des aktuell ausgewählten Tarifs erfasst.
# This variable holds the net price of the currently selected tariff.
planPriceNet = parsePlanPrice(seleniumDriver)
# In dieser Variablen wird die Liste aller verfügbaren Kampagnen erfasst.
# This variable stores the list of all available campaigns for the tariff.
campaigns = parseCampaigns(seleniumDriver)
# In dieser Zeile wird der Tarif in die plans.csv geschrieben.
# This line writes the tariff record into plans.csv.
plansWriter.writerow([
tariffId,
tarifWelt,
......@@ -550,10 +490,10 @@ def scrapeData(seleniumManager):
currentFramework
])
# In dieser Zeile wird der Tarif als bereits erfasst markiert.
# This line marks the tariff as written to avoid duplication.
writtenPlanIdSet.add((tariffId, currentFramework))
# In dieser for-Schleife werden alle Kampagnen in die campaigns.csv geschrieben.
# This for-structure writes each campaign to the campaigns.csv file.
for (campId, campName) in campaigns:
campaignsWriter.writerow([
campId,
......@@ -561,21 +501,23 @@ def scrapeData(seleniumManager):
campName
])
# In diesen Variablen werden die URLs für PDF-Dokumente abgeleitet.
# This variable constructs the URL for the flyer PDF.
flyerPdfUrl = f"https://maui.mobilcom.de/vertragserfassung/show_pib_flyer.php?variant_id={tariffId}"
# This variable constructs the URL for the PIB PDF.
pibPdfUrl = flyerPdfUrl + "&pib"
# In diesem try-Block werden die PDFs heruntergeladen.
# This try-structure attempts to download the PDF documents.
try:
downloadPdfSelenium(seleniumDriver, flyerPdfUrl, cacheDir, f"{tariffId}_flyer.pdf")
downloadPdfSelenium(seleniumDriver, pibPdfUrl, cacheDir, f"{tariffId}_pib.pdf")
except Exception as exception:
print(f"DEBUG: Fehler beim PDF-Download für Tarif {tariffId}: {exception}")
# In dieser Variablen wird gespeichert, ob zur Optionsseite navigiert werden konnte.
# This variable will store whether navigation to the options page succeeded.
navigationToOptionsSuccessful = False
# In diesem try-Block wird versucht, zur Optionsseite zu navigieren.
# This try-structure attempts to navigate to the options page.
try:
print(f"DEBUG: Versuche zur Optionsseite zu navigieren für Tarif {tariffId}...")
wait.until(EC.presence_of_element_located((By.NAME, "mobildaten")))
......@@ -588,7 +530,7 @@ def scrapeData(seleniumManager):
traceback.print_exc()
continue
# In dieser if-Abzweigung wird geprüft, ob die Navigation erfolgreich war.
# This if-structure proceeds only if navigation to the options page was successful.
if navigationToOptionsSuccessful:
try:
print(f"DEBUG: Rufe scrapeOption für Tarif {tariffId} auf.")
......@@ -602,58 +544,56 @@ def scrapeData(seleniumManager):
print(f"DEBUG: Fehler während scrapeOption für Tarif {tariffId}: {exception}")
traceback.print_exc()
# In dieser Zeile werden die CSV-Dateien nach jedem Tarif zwischengespeichert.
# This print call notifies that CSV files are flushed after processing each tariff.
print(f"DEBUG: Flushe CSV-Dateien nach Verarbeitung von Tarif {tariffId}.")
plansFile.flush()
campaignsFile.flush()
optionsFile.flush()
categorysFile.flush()
# In dieser Funktion werden die Optionsdaten eines Tarifs auf der Optionsseite extrahiert.
# This function scrapes the options for a given tariff on the options page.
def scrapeOption(seleniumManager, tariffId, optionsWriter, categorysWriter):
# In dieser Debug-Ausgabe wird mitgeteilt, dass das Scraping der Option gestartet wurde.
# This print call is a debug message indicating the start of scraping options.
print(f"DEBUG: scrapeOption gestartet für Tarif {tariffId}.")
# In dieser Variablen wird auf den im SeleniumManager gespeicherten Driver zugegriffen.
# This variable references the Selenium driver from the SeleniumManager.
seleniumDriver = seleniumManager.driver
# In dieser Variablen wird ein WebDriverWait-Objekt mit 20 Sekunden Timeout angelegt.
# This variable is a WebDriverWait with a 20-second timeout.
wait = WebDriverWait(seleniumDriver, 20)
# In diesem try-Block findet das eigentliche Parsing der Optionsseite statt.
# This try-structure attempts to parse the options page content.
try:
# In dieser Zeile wird bis zu 60 Sekunden auf das Verschwinden eines Overlays gewartet.
# This print call indicates waiting for the overlay to be invisible.
print(f"DEBUG: Warte auf Unsichtbarkeit von bg_layer für Tarif {tariffId}.")
WebDriverWait(seleniumDriver, timeout=60).until(EC.invisibility_of_element_located((By.ID, "bg_layer")))
# In dieser Zeile wird darauf gewartet, dass das Formular 'tarifoptionen' im DOM vorhanden ist.
# This print call indicates waiting for the 'tarifoptionen' form to be in the DOM.
print(f"DEBUG: Warte auf Formular 'tarifoptionen' für Tarif {tariffId}.")
wait.until(EC.presence_of_element_located((By.NAME, "tarifoptionen")))
# In dieser Zeile wird geprüft, ob mindestens eine Tabelle mit Klasse 'tb_back' vorhanden ist.
# This print call indicates waiting for at least one table with class 'tb_back' to appear.
print(f"DEBUG: Warte auf Klasse 'tb_back' für Tarif {tariffId}.")
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "tb_back")))
print(f"DEBUG: Optionsseite für Tarif {tariffId} scheint geladen zu sein.")
# In dieser Variablen wird der komplette HTML-Inhalt gespeichert.
# This variable stores the entire HTML content of the current page.
htmlContent = seleniumDriver.page_source
# In dieser Variablen wird ein BeautifulSoup-Objekt erstellt, um den Inhalt zu parsen.
# This variable is a BeautifulSoup object for parsing the page content.
soupObj = BeautifulSoup(htmlContent, "html.parser")
print(f"DEBUG: Rufe parse_options für Tarif {tariffId} auf.")
optionsData, categoryData = parseOptions(soupObj)
print(f"DEBUG: parse_options fand {len(optionsData)} Optionen und {len(categoryData)} Kategorien für Tarif {tariffId}.")
# In dieser Variablen wird gezählt, wie viele neue Kategorien geschrieben werden.
# This variable tracks how many new categories are written to the file.
catsWritten = 0
# In dieser for-Schleife werden alle Kategorien verarbeitet.
# This for-structure processes each category line found.
for catLine in categoryData:
parts = catLine.split(";", 1)
if len(parts) == 2:
......@@ -662,16 +602,18 @@ def scrapeOption(seleniumManager, tariffId, optionsWriter, categorysWriter):
global uniqueCategorySet
if catId not in uniqueCategorySet:
# This line writes the category data to categorys.csv.
categorysWriter.writerow([catId, catName])
uniqueCategorySet.add(catId)
catsWritten += 1
print(f"DEBUG: {catsWritten} neue Kategorien in CSV geschrieben für Tarif {tariffId}.")
# In dieser Variablen wird gezählt, wie viele Optionen geschrieben werden.
# This variable tracks how many options are written to the file.
optsWritten = 0
# In dieser for-Schleife werden die ermittelten Optionszeilen verarbeitet.
# This for-structure processes each options line found.
for line in optionsData:
parts = line.split(";", 3)
if len(parts) == 4:
......@@ -680,15 +622,16 @@ def scrapeOption(seleniumManager, tariffId, optionsWriter, categorysWriter):
itemName = parts[2]
priceStr = parts[3]
# In diesem try-Block wird der Preis als float konvertiert.
# This try-structure converts the price to float.
try:
grossPrice = float(priceStr)
except ValueError:
grossPrice = 0.0
# In dieser Zeile wird der Nettopreis auf Basis von 19% MwSt. berechnet.
# This variable calculates the net price from the gross price using 19% tax.
netPrice = round(grossPrice / 1.19, 5)
# This line writes the option data to the options.csv file.
optionsWriter.writerow([
itemId,
categoryRefId,
......@@ -700,20 +643,24 @@ def scrapeOption(seleniumManager, tariffId, optionsWriter, categorysWriter):
print(f"DEBUG: {optsWritten} Optionen in CSV geschrieben für Tarif {tariffId}.")
# In dieser except-Abzweigung werden Fehler während des Parsings protokolliert.
# This except-structure logs any errors during parsing or writing.
except Exception as exception:
print(f"FEHLER in scrapeOption (Parsing/Writing) für Tarif {tariffId}: {exception}")
traceback.print_exc()
# In diesem finally-Block wird versucht, auf die Hauptseite (Mobildaten) zurück zu navigieren.
# This finally-structure attempts to navigate back to the main page after processing options.
finally:
# In diesen Variablen werden die Anzahl der Versuche und die Pausenzeit definiert.
# This variable sets how many navigation retries are allowed.
maxRetriesNav = 2
# This variable sets the delay between navigation retries.
retryDelayNav = 3
# This variable indicates whether the return navigation was successful.
backNavSuccessful = False
# In dieser for-Schleife werden mehrere Versuche zum Rücksprung in die Hauptseite durchgeführt.
# This for-structure makes several attempts to return to the main page.
for attempt in range(maxRetriesNav):
try:
print(f"DEBUG: Versuche zurückzunavigieren von Optionsseite für Tarif {tariffId} (Versuch {attempt + 1}/{maxRetriesNav}).")
......@@ -732,47 +679,45 @@ def scrapeOption(seleniumManager, tariffId, optionsWriter, categorysWriter):
else:
print(f"DEBUG: Endgültige Warnung: Konnte nach {maxRetriesNav} Versuchen nicht von Tarif {tariffId} zurücknavigieren.")
# In dieser if-Abfrage wird protokolliert, falls die Rücknavigation nicht geklappt hat.
# This if-structure logs a message if navigation back to the main page was not successful.
if not backNavSuccessful:
print(f"DEBUG: Rücknavigation von Tarif {tariffId} war nicht erfolgreich. Fortsetzung kann instabil sein.")
# In dieser Funktion werden die Optionen und Kategorien im HTML-Dokument geparst und aufbereitet.
# This function parses the HTML for options and categories on the options page.
def parseOptions(soupObj):
# In dieser Liste werden alle gefundenen Optionen gespeichert.
# This variable is a list for storing all discovered options.
optionsResults = []
# In dieser Liste werden alle gefundenen Kategorien gespeichert.
# This variable is a list for storing all discovered categories.
categoryResults = []
# In diesem Set werden Kategorien gesammelt, die schon hinzugefügt wurden, um Duplikate zu vermeiden.
# This variable is a set to track category IDs that have already been added.
collectedCategoryIds = set()
# In dieser Variablen wird ein RegEx für die Prüfung von Gruppenfeldern definiert.
# This variable is a regex for identifying group check inputs.
categoryCheckPattern = re.compile(r'service_code\[(G\d+)_check\]')
# In dieser Variablen wird ein RegEx für versteckte Gruppenfelder definiert.
# This variable is a regex for identifying hidden group inputs.
categoryHiddenPattern = re.compile(r'service_code\[(G\d+)_check\]')
# In dieser Variablen wird ein RegEx für Radio-Buttons in Gruppenfeldern definiert.
# This variable is a regex for identifying radio inputs in group fields.
categoryRadioPattern = re.compile(r'service_code\[(G\d+)\]')
# In dieser Variablen wird ein RegEx für Item-IDs definiert, die mit G oder O beginnen.
# This variable is a regex for matching item IDs starting with G or O.
itemValuePattern = re.compile(r'^(G\d+|O\d+)$')
# In dieser Variablen wird ein RegEx für Preise definiert, um Beträge im Text zu erkennen.
# This variable is a regex for matching monthly prices in the text.
pricePattern = re.compile(r'/\s*€\s*([\d.,]+)\s*monatlich', re.IGNORECASE)
# In dieser Variablen wird ein RegEx definiert, um Sub-Selects zu erkennen.
# This variable is a regex for sub-select fields within a group.
subSelectPattern = re.compile(r"service_code\[(G\d+)_S\d+\]")
# In dieser Variablen werden alle Haupttabellen mit Klasse 'tb_back' gesucht.
# This variable finds all main tables with class 'tb_back'.
allPotentialMainTables = soupObj.find_all("table", class_="tb_back")
print(f"DEBUG: parse_options: {len(allPotentialMainTables)} potenzielle Haupttabellen (tb_back) gefunden.")
# In dieser for-Schleife wird jede gefundene Tabelle untersucht.
# This for-structure processes each found main table.
for tbl in allPotentialMainTables:
catNameEl = tbl.find("td", class_="tb_head")
......@@ -790,7 +735,7 @@ def parseOptions(soupObj):
catInputHidden = tbl.find("input", type="hidden", attrs={"name": categoryHiddenPattern})
catInputRadio = tbl.find("input", type="radio", attrs={"name": categoryRadioPattern})
# In dieser if-Abfolge wird geprüft, welche Kategorie-ID wir aus welcher Input-Variante ziehen können.
# This if-structure checks which type of input can provide the category ID.
if catInputCheck:
matchCheck = categoryCheckPattern.search(catInputCheck.get("name", ""))
if matchCheck:
......@@ -804,30 +749,30 @@ def parseOptions(soupObj):
if matchRadio:
categoryId = matchRadio.group(1)
# In dieser if-Abzweigung werden irrelevante oder unbekannte Gruppen ausgeschlossen.
# This if-structure excludes irrelevant or unknown groups.
if not categoryId or catText in ["Sonstige Angaben", "Pflicht-Angaben"]:
continue
print(f"DEBUG: Verarbeite Optionsgruppe: {categoryId} - {catText}")
# In dieser if-Abzweigung wird die Kategorie einmalig in die categoryResults aufgenommen.
# This if-structure adds a new category to categoryResults if it has not been added yet.
if categoryId not in collectedCategoryIds:
categoryResults.append(f"{categoryId};{catText}")
collectedCategoryIds.add(categoryId)
# In dieser Variablen werden mögliche Untertabellen gesucht.
# This variable finds potential sub-tables within the main table.
subTables = tbl.find_all("table", {"border": "0", "width": "520", "cellspacing": "0", "cellpadding": "4"})
if not subTables:
subTables = [tbl]
lastGId = None
# In dieser for-Schleife werden die Untertabellen untersucht.
# This for-structure processes each sub-table to find inputs and sub-selects.
for subTbl in subTables:
inp = subTbl.find("input", attrs={"value": itemValuePattern})
subSelect = subTbl.find("select", attrs={"name": subSelectPattern})
# In dieser if-Abzweigung wird geprüft, ob ein passendes Input-Feld gefunden wurde.
# This if-structure checks if a matching input field was found.
if inp:
itemId = inp.get("value", "").strip()
if not itemId:
......@@ -839,7 +784,7 @@ def parseOptions(soupObj):
itemName = "Unbekannt"
# In dieser if-Abzweigung wird der Text des Label-Tags als Name verwendet, falls vorhanden.
# This if-structure attempts to extract the label text if it exists.
if itemLabelTag and itemLabelTag.text.strip():
itemName = re.sub(r'\s+', ' ', itemLabelTag.text.strip())
else:
......@@ -849,7 +794,7 @@ def parseOptions(soupObj):
if linkInDiv and linkInDiv.text.strip():
itemName = re.sub(r'\s+', ' ', linkInDiv.text.strip())
# In dieser if-Abzweigung wird fortgefahren, wenn kein Name ermittelt werden kann.
# This if-structure continues if no valid name is found.
if itemName == "Unbekannt":
continue
......@@ -857,7 +802,7 @@ def parseOptions(soupObj):
mPrice = pricePattern.search(combinedText)
priceStr = "0.0"
# In dieser if-Abzweigung wird ein gefundener Preis verarbeitet.
# This if-structure checks if a price was found in the text.
if mPrice:
rawPrice = mPrice.group(1)
normalized = rawPrice.replace(".", "").replace(",", ".")
......@@ -867,7 +812,7 @@ def parseOptions(soupObj):
except ValueError:
priceStr = "0.0"
# In dieser if-Abzweigung wird unterschieden, ob wir eine Gruppen-ID oder eine normale Option haben.
# This if-structure differentiates between group items and normal options.
if itemId.startswith("G"):
optionsResults.append(f"{categoryId};{itemId};{itemName};{priceStr}")
lastGId = itemId
......@@ -878,7 +823,7 @@ def parseOptions(soupObj):
optionsResults.append(f"{categoryId};{itemId};{itemName};{priceStr}")
lastGId = None
# In dieser if-Abzweigung wird geprüft, ob wir ein Sub-Select haben und zuletzt eine Gruppen-ID gespeichert wurde.
# This if-structure handles sub-select elements if we have a stored group ID.
if subSelect and lastGId:
optionTags = subSelect.find_all("option", attrs={"value": re.compile(r"^O\d+$")})
for optTag in optionTags:
......@@ -908,47 +853,45 @@ def parseOptions(soupObj):
lastGId = None
# In dieser Liste werden doppelte Einträge entfernt.
# This variable deduplicates the options list.
uniqueOptions = list(set(optionsResults))
# This variable deduplicates the categories list.
uniqueCategoriesList = list(set(categoryResults))
print(f"DEBUG: parse_options: Gibt {len(uniqueOptions)} eindeutige Optionen und {len(uniqueCategoriesList)} eindeutige Kategorien zurück.")
return uniqueOptions, uniqueCategoriesList
def hasSubSelectForId(gId, subSelects):
    """Return True if any select element in *subSelects* belongs to group *gId*.

    Sub-select elements are named like ``service_code[G<n>_S<m>]`` (see
    ``subSelectPattern``).  The match is anchored on the full group token
    ``"<gId>_S"`` instead of a plain substring test, which fixes a collision
    bug: previously ``gId in name`` made ``G1`` falsely match the name of a
    ``G12``/``G10`` sub-select.

    Args:
        gId: Group identifier such as ``"G3"``.
        subSelects: Iterable of select elements (anything with a ``.get``
            method, e.g. BeautifulSoup tags) whose ``name`` attribute follows
            the sub-select naming scheme.

    Returns:
        bool: True when a sub-select for exactly this group exists.
    """
    # The "_S" suffix guarantees we only match this group's own sub-selects,
    # never a longer group id that merely starts with the same digits.
    token = f"{gId}_S"
    for sel in subSelects:
        if token in sel.get("name", ""):
            return True
    return False
# --- Script entry sequence ---
# Create the Selenium driver wrapper.  NOTE(review): SeleniumManager() is
# called without arguments here, so browser visibility and the GeckoDriver
# path come from its defaults — the old comment claiming a path is passed
# did not match the code; confirm the defaults are the intended setup.
seleniumManager = SeleniumManager()
# Log in with the module-level MAUI credentials.
login(seleniumManager, MAUI_USERNAME, MAUI_PASSWORD, MAUI_AUTHCODE)
# Navigate to the "Laufzeitvertrag" (fixed-term contract) page.
openLaufzeitvertrag(seleniumManager)
# Give the page a moment to settle before scraping starts.
time.sleep(5)
# Scrape all tariff data and write it to the CSV files.
scrapeData(seleniumManager)
# Wait before shutdown so any pending browser activity can finish.
time.sleep(10)
# Release the WebDriver and its browser process.
seleniumManager.closeDriver()
# Final console notice ("scraping finished").
print("Scraping abgeschlossen.")
......@@ -11,86 +11,91 @@ from models.base_base import BaseBase
from models.deal_deal import DealDeal
from models.option_opti import OptionOpti
#
# Configure logging so every message carries a timestamp and a severity
# level, which makes troubleshooting the import run straightforward.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s"
)
# Directory of the local cache that holds all CSV source files.
# NOTE(review): this is a *relative* path (the old comment called it
# absolute) — the script must be started from its own directory for the
# files to resolve; confirm against the cron/wrapper setup.
CSV_DIR = os.path.join("..", "cache")
# Paths of the four CSV files consumed by this import run.
csvFileCategories = os.path.join(CSV_DIR, "categorys.csv")
csvFilePlans = os.path.join(CSV_DIR, "plans.csv")
csvFileCampaigns = os.path.join(CSV_DIR, "campaigns.csv")
csvFileOptions = os.path.join(CSV_DIR, "options.csv")
#
def read_csv(path):
    """Load a semicolon-delimited CSV file and return its rows.

    Args:
        path: Path to the CSV file; its first line must be the header row.

    Returns:
        list[dict]: One dictionary per data row, keyed by the header fields.
    """
    # UTF-8 with newline="" is the csv-module-recommended way to open files,
    # so non-ASCII characters and embedded newlines are handled correctly.
    with open(path, newline="", encoding="utf-8") as handle:
        reader = csv.DictReader(handle, delimiter=";")
        return list(reader)
#
# Load the four CSV files into lists of row dictionaries so their contents
# are available in memory for the rest of the import run.
cat_rows = read_csv(csvFileCategories)
plan_rows = read_csv(csvFilePlans)
camp_rows = read_csv(csvFileCampaigns)
opt_rows = read_csv(csvFileOptions)
#
# Map every category id to its human-readable name for quick look-ups.
category_name = {r["id"].strip(): r["name"].strip() for r in cat_rows}
#
# Group all campaign rows by the id of their related plan; defaultdict(list)
# lets multiple campaigns accumulate under the same plan key without checks.
campaigns_by_plan = defaultdict(list)
for c in camp_rows:
    # Keys are stripped because the CSV values may carry stray whitespace.
    campaigns_by_plan[c["plan"].strip()].append(c)
#
# Group all option rows by the id of their related plan, same scheme as above.
options_by_plan = defaultdict(list)
for o in opt_rows:
    options_by_plan[o["plan"].strip()].append(o)
#
# Open the database connection manager and create the single SQLAlchemy
# session used for every database operation in this run.
mysql = MysqlManager()
session = mysql.getSession()
#
# Dieses Wörterbuch enthält alle bestehenden Basiseinträge, damit später neue Einträge erkannt werden können.
base_db = {(b.provider_base, b.providercode_base): b
for b in session.query(BaseBase).all()}
# The dictionary “base_db” maps a tuple of provider name and provider code to the corresponding BaseBase object.
base_db = {(b.provider_base, b.providercode_base): b for b in session.query(BaseBase).all()}
#
# Diese verschachtelte Struktur hält alle vorhandenen Deals pro Base-ID, wodurch ein schneller Abgleich ermöglicht wird.
# The default dictionary “deals_db” groups all existing DealDeal rows by their base id for quick comparison later on.
deals_db = defaultdict(dict)
for d in session.query(DealDeal).all():
# Each deal is added to the inner dictionary that is addressed by the base id so provider codes become the second‑level keys.
deals_db[d.base_deal][d.providercode_deal] = d
#
# Diese verschachtelte Struktur hält alle vorhandenen Optionen pro Base-ID, um später Stop- und Reaktivierungslogik anzuwenden.
# The default dictionary “opts_db” groups all existing OptionOpti rows by their base id for quick comparison later on.
opts_db = defaultdict(dict)
for o in session.query(OptionOpti).all():
# Each option is added to the inner dictionary that is addressed by the base id so provider codes become the second‑level keys.
opts_db[o.base_opti][o.providercode_opti] = o
#
# Hier wird der aktuelle Zeitpunkt einmalig festgelegt, um ihn konsistent für alle neu erzeugten Datensätze zu verwenden.
# The variable “now” stores the current timestamp so all new rows share an identical creation and update time.
now = datetime.datetime.now()
#
# Diese Liste sammelt alle neu anzulegenden Basiseinträge, damit sie in einem Schritt geschrieben werden können.
# The list “new_bases” collects BaseBase objects that need to be inserted because they do not yet exist in the database.
new_bases = []
for p in plan_rows:
# The variable “prov_base” combines several CSV columns to form the provider_base value used in the database.
prov_base = f"Freenet | {p['provider'].strip()} | {p['rahmen'].strip()}"
# The variable “key” uniquely identifies a base by provider_base and provider code so duplicates can be detected.
key = (prov_base, p["id"].strip())
# This branch creates a new BaseBase object when the combination of provider_base and provider code is unknown.
if key not in base_db:
# The variable “b” stores the new BaseBase object that is populated with basic attributes and timestamps.
b = BaseBase(
provider_base = prov_base,
providercode_base = p["id"].strip(),
......@@ -101,31 +106,34 @@ for p in plan_rows:
new_bases.append(b)
base_db[key] = b
#
# Hier werden alle neu erkannten Basiseinträge in einem einzigen Datenbankvorgang gespeichert.
# This branch writes all new BaseBase objects to the database in one bulk operation and refreshes their primary keys.
if new_bases:
session.add_all(new_bases)
session.flush()
logging.info("Inserted %d new bases", len(new_bases))
#
# Diese verschachtelten Mengen erfassen für jede Base-ID die in diesem Lauf gewünschten Deals und Optionen.
# The default dictionaries below keep track of all deals and options that should exist after the import run.
desired_deals = defaultdict(set)
desired_opts = defaultdict(set)
#
# Diese Listen sammeln alle Datensätze, die per INSERT IGNORE neu geschrieben oder aktualisiert werden sollen.
# The two lists below accumulate dictionaries that will later be used for INSERT IGNORE bulk operations.
deal_rows_insert = []
opt_rows_insert = []
for p in plan_rows:
# The variable “prov_base” recomputes the provider_base value so it matches the previously used key.
prov_base = f"Freenet | {p['provider'].strip()} | {p['rahmen'].strip()}"
# The variable “base_obj” retrieves the corresponding BaseBase object from the in‑memory cache.
base_obj = base_db[(prov_base, p["id"].strip())]
# The variable “b_id” stores the primary key of the current BaseBase row.
b_id = base_obj.id_base
# The variable “price” converts the plan price from a string into a Decimal object for precise arithmetic.
price = Decimal(p["price"].strip() or "0.00")
#
# Dieser Block fügt den obligatorischen Standard-Deal ohne Kampagnenkennung hinzu.
# The mandatory default deal without campaign code is added to the desired set and the insert list.
desired_deals[b_id].add("")
deal_rows_insert.append({
"provisiongroup_deal": 1,
......@@ -139,9 +147,10 @@ for p in plan_rows:
"updated_deal": now
})
#
# Dieser Block verarbeitet alle Kampagnen zum aktuellen Plan und fügt sie der Wunschliste hinzu.
# Every campaign row belonging to the current plan is processed and prepared for insertion.
for c in campaigns_by_plan[p["id"].strip()]:
# The variable “code” prefixes the campaign id with “A” so it conforms to the provider_code format.
code = f"A{c['id'].strip()}"
desired_deals[b_id].add(code)
deal_rows_insert.append({
......@@ -156,9 +165,10 @@ for p in plan_rows:
"updated_deal": now
})
#
# Dieser Block fügt alle Optionen zum aktuellen Plan der Wunschliste hinzu und bereitet die Insert-Zeilen vor.
# Every option row belonging to the current plan is processed and prepared for insertion.
for o in options_by_plan[p["id"].strip()]:
# The variable “code_opt” holds the raw option id string so it can be used as provider code.
code_opt = o["id"].strip()
desired_opts[b_id].add(code_opt)
opt_rows_insert.append({
......@@ -179,92 +189,87 @@ for p in plan_rows:
"updated_opti": now
})
#
# In diesem Schritt werden doppelte Deal- und Optionszeilen anhand ihrer Schlüsselwerte entfernt.
# Duplicate deals are removed by transforming the list into a dictionary keyed by base id and provider code.
deal_rows_insert = list({(r["base_deal"], r["providercode_deal"]): r for r in deal_rows_insert}.values())
# Duplicate options are removed by transforming the list into a dictionary keyed by base id, provider code, and category.
opt_rows_insert = list({
(r["base_opti"], r["providercode_opti"], r["providercategory_opti"]): r
for r in opt_rows_insert
(r["base_opti"], r["providercode_opti"], r["providercategory_opti"]): r for r in opt_rows_insert
}.values())
#
# Diese Listen sammeln Datensätze, deren Status auf gestoppt oder reaktiviert gesetzt werden muss.
# The lists below collect maps that will later be used to stop or reactivate existing deals and options.
stop_deals, react_deals = [], []
stop_opts, react_opts = [], []
for (prov, _), b in base_db.items():
# This branch skips bases that were not imported from Freenet.
if not prov.startswith("Freenet"):
continue
# The variable “b_id” stores the primary key of the base row currently being checked.
b_id = b.id_base
# The variables “wantD” and “haveD” hold the sets of desired and existing deals for the current base.
wantD = desired_deals.get(b_id, set())
haveD = deals_db.get(b_id, {})
# This loop checks every existing deal so its stop status can be updated when necessary.
for code, obj in haveD.items():
# This branch marks a stopped deal for reactivation when the code is desired but the stop date is set.
if code in wantD and obj.stops_deal is not None:
react_deals.append({"id_deal": obj.id_deal,
"stops_deal": None,
"updated_deal": now})
react_deals.append({"id_deal": obj.id_deal, "stops_deal": None, "updated_deal": now})
# This branch marks an active deal for stopping when the code is no longer desired.
if code not in wantD and obj.stops_deal is None:
stop_deals.append({"id_deal": obj.id_deal,
"stops_deal": now,
"updated_deal": now})
stop_deals.append({"id_deal": obj.id_deal, "stops_deal": now, "updated_deal": now})
# The variables “wantO” and “haveO” hold the sets of desired and existing options for the current base.
wantO = desired_opts.get(b_id, set())
haveO = opts_db.get(b_id, {})
# This loop checks every existing option so its stop status can be updated when necessary.
for code, obj in haveO.items():
# This branch marks a stopped option for reactivation when the code is desired but the stop date is set.
if code in wantO and obj.stops_opti is not None:
react_opts.append({"id_opti": obj.id_opti,
"stops_opti": None,
"updated_opti": now})
react_opts.append({"id_opti": obj.id_opti, "stops_opti": None, "updated_opti": now})
# This branch marks an active option for stopping when the code is no longer desired.
if code not in wantO and obj.stops_opti is None:
stop_opts.append({"id_opti": obj.id_opti,
"stops_opti": now,
"updated_opti": now})
#
# Dieser Block schreibt alle gewünschten Deals per INSERT IGNORE in die Datenbank.
session.execute(
mysql_insert(DealDeal.__table__).prefix_with("IGNORE"),
deal_rows_insert
)
stop_opts.append({"id_opti": obj.id_opti, "stops_opti": now, "updated_opti": now})
# All prepared deal rows are written to the database using INSERT IGNORE so duplicates are silently skipped.
session.execute(mysql_insert(DealDeal.__table__).prefix_with("IGNORE"), deal_rows_insert)
logging.info("INSERT IGNORE'd %d deals", len(deal_rows_insert))
#
# Dieser Block schreibt alle gewünschten Optionen per INSERT IGNORE in die Datenbank.
session.execute(
mysql_insert(OptionOpti.__table__).prefix_with("IGNORE"),
opt_rows_insert
)
# All prepared option rows are written to the database using INSERT IGNORE so duplicates are silently skipped.
session.execute(mysql_insert(OptionOpti.__table__).prefix_with("IGNORE"), opt_rows_insert)
logging.info("INSERT IGNORE'd %d options", len(opt_rows_insert))
#
# Dieser Block aktualisiert alle Deals, die jetzt gestoppt werden müssen.
# All deals marked for stopping are updated in bulk so their stop dates are set in a single efficient query.
if stop_deals:
session.bulk_update_mappings(DealDeal, stop_deals)
logging.info("Stopped %d deals", len(stop_deals))
#
# Dieser Block aktualisiert alle Deals, die wieder reaktiviert werden müssen.
# All deals marked for reactivation are updated in bulk so their stop dates are cleared in a single efficient query.
if react_deals:
session.bulk_update_mappings(DealDeal, react_deals)
logging.info("Reactivated %d deals", len(react_deals))
#
# Dieser Block aktualisiert alle Optionen, die jetzt gestoppt werden müssen.
# All options marked for stopping are updated in bulk so their stop dates are set in a single efficient query.
if stop_opts:
session.bulk_update_mappings(OptionOpti, stop_opts)
logging.info("Stopped %d options", len(stop_opts))
#
# Dieser Block aktualisiert alle Optionen, die wieder reaktiviert werden müssen.
# All options marked for reactivation are updated in bulk so their stop dates are cleared in a single efficient query.
if react_opts:
session.bulk_update_mappings(OptionOpti, react_opts)
logging.info("Reactivated %d options", len(react_opts))
#
# Hier werden sämtliche Änderungen dauerhaft in der Datenbank gespeichert.
# The commit call persists every change made in this run so all inserts and updates become permanent.
session.commit()
#
# Zum Abschluss wird die Session geschlossen, um Ressourcen freizugeben.
# The session is closed to release database connections and other resources held by SQLAlchemy.
session.close()
logging.info("Import-Lauf abgeschlossen.")
logging.info("Import run finished successfully.")
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
sys.path.append("..")
# This script scans a local cache directory for PDF files belonging to base tariffs, uploads the PDFs to S3, writes the resulting URLs back into the MySQL database, and logs progress as well as errors to stdout.
import sys; sys.path.append("..")
import os
import datetime
from manager.S3Manager import S3Manager
......@@ -12,98 +9,111 @@ from models.deal_deal import DealDeal
from models.option_opti import OptionOpti
from models.provisiongroup_pgro import ProvisiongroupPgro # zwingend, um Abhängigkeits-Mapping zu initialisieren
#
# Dieses Verzeichnis enthält sämtliche PDF-Dateien für den Upload.
# The variable "cacheDir" stores the file‑system path that contains the PDF files waiting for upload.
cacheDir = "../cache"
#
# Dieses Objekt übernimmt das Hochladen der Dateien in den S3-Bucket und liefert die endgültige URL.
# The variable "s3Manager" holds an instance that encapsulates S3 upload functionality.
s3Manager = S3Manager()
#
# Diese Datenbank-Session ermöglicht Abfragen und Aktualisierungen innerhalb der MySQL-Datenbank.
# The variable "dbSession" stores a SQLAlchemy session used to query and update the MySQL database.
dbSession = MysqlManager().getSession()
#
# Diese Liste sammelt alle PDF-Dateinamen im Cache-Verzeichnis.
# The variable "pdfFiles" gathers all file names inside the cache directory whose names end with the ".pdf" extension, case‑insensitive.
pdfFiles = [f for f in os.listdir(cacheDir) if f.lower().endswith(".pdf")]
#
# Collect the distinct base identifiers referenced by the cached PDFs.
# File names follow "<id>_flyer.pdf" / "<id>_pib.pdf"; both variants are
# reduced to the same bare "<id>" so each base is processed once.
pdfIdSet = set()
for name in pdfFiles:
    # Drop the 4-character ".pdf" extension and normalise to lowercase.
    stem = name[:-4].lower()
    if stem.endswith("_flyer"):
        # Strip the 6-character "_flyer" suffix to obtain the bare id.
        pdfIdSet.add(stem[:-6])
    elif stem.endswith("_pib"):
        # Strip the 4-character "_pib" suffix to obtain the bare id.
        pdfIdSet.add(stem[:-4])
#
# Abort early when the cache holds no matching PDFs: close the DB session
# and exit with status 0, because an empty cache is not treated as an error.
if not pdfIdSet:
    print(f"INFO: Keine PDF-Paare in '{cacheDir}' gefunden.")
    dbSession.close()
    sys.exit(0)
#
# Process each distinct base identifier in sorted (deterministic) order:
# upload its flyer/PIB PDFs to S3 and write the resulting URLs back into
# every matching BaseBase row.
for currentId in sorted(pdfIdSet):
    # Console marker so the log clearly separates the work per identifier.
    print(f"\n--- Verarbeitung ID: {currentId} ---")
    #
    # Fetch *all* BaseBase rows with this provider code so duplicate
    # matches are updated consistently rather than only the first one.
    baseRecords = dbSession.query(BaseBase).filter_by(providercode_base=currentId).all()
    # Without a matching database row there is nothing to attach the URLs
    # to, so the identifier is skipped with a warning.
    if not baseRecords:
        print(f"WARNUNG: Kein BaseBase-Eintrag für providercode_base='{currentId}'.")
        continue
    #
    # Candidate path of the flyer PDF for this identifier.
    flyerPath = os.path.join(cacheDir, f"{currentId}_flyer.pdf")
    #
    # Candidate path of the PIB PDF for this identifier.
    pibPath = os.path.join(cacheDir, f"{currentId}_pib.pdf")
    #
    # Holds the uploaded flyer URL; stays None when the file is absent or
    # the upload fails (uploadFile is expected to return a falsy value then).
    flyerUrl = None
    if os.path.exists(flyerPath):
        # Destination key inside the S3 bucket.
        flyerKey = f"flyers/{currentId}_flyer.pdf"
        flyerUrl = s3Manager.uploadFile(flyerPath, flyerKey)
        # A falsy return value signals a failed upload; log it but keep
        # going so the PIB of the same identifier can still be processed.
        if not flyerUrl:
            print(f"FEHLER: Flyer-Upload fehlgeschlagen für ID {currentId}")
    #
    # Holds the uploaded PIB URL; same None/falsy semantics as flyerUrl.
    pibUrl = None
    if os.path.exists(pibPath):
        # Destination key inside the S3 bucket.
        pibKey = f"pibs/{currentId}_pib.pdf"
        pibUrl = s3Manager.uploadFile(pibPath, pibKey)
        if not pibUrl:
            print(f"FEHLER: PIB-Upload fehlgeschlagen für ID {currentId}")
    #
    # Write the URLs into every matching row, but never overwrite a URL
    # that is already stored (only None columns are filled in).
    for base in baseRecords:
        if base.flyerurl_base is None and flyerUrl:
            base.flyerurl_base = flyerUrl
            base.updated_base = datetime.datetime.now()
            print(f"INFO: flyerurl_base gesetzt: {flyerUrl}")
        if base.piburl_base is None and pibUrl:
            base.piburl_base = pibUrl
            base.updated_base = datetime.datetime.now()
            print(f"INFO: piburl_base gesetzt: {pibUrl}")
    #
    # Commit once per identifier so its changes are persisted atomically.
    dbSession.commit()
#
# All identifiers are done: release the database session and its resources.
dbSession.close()
#
# Final console notice confirming the whole upload sequence finished.
print("INFO: Upload-Vorgang abgeschlossen.")
\ No newline at end of file
#!/bin/bash
# Dieser Wrapper wechselt ins Verzeichnis /maui/commands und startet das
# gewünschte Python-Skript (mit python3), sofern nicht bereits eine Instanz
# dieses Skripts läuft. Gleichzeitig werden alle Ausgaben in zwei getrennten
# Logfiles im Verzeichnis /maui/logs abgelegt, wobei jedes Skript einen
# eigenen Unterordner erhält (benannt nach dem Skriptnamen ohne Erweiterung)
# und die Logfiles die Namen im Format
# - L_yyyymmdd-hhiiss.txt für die Standardausgabe,
# - E_yyyymmdd-hhiiss.err für die Fehlermeldung
# tragen. Logfiles, die älter als 24 Stunden (1440 Minuten) sind, werden
# automatisch gelöscht.
#--- Parameterprüfung ---
# In dieser Abfrage wird überprüft, ob mindestens ein Parameter übergeben wurde.
# This script guarantees that only one instance of a specified Python job runs simultaneously, captures its standard and error output in timestamped log files, and notifies a monitoring endpoint when errors occur.
# The following conditional branch checks whether at least one positional argument has been provided; if not, usage information is printed and the script terminates with exit status 1.
#--- Parameter check ---------------------------------------------------------
# Abort with usage information unless at least the job file name was supplied.
if [ "$#" -lt 1 ]; then
    echo "Usage: $0 <jobfilename> [arguments...]"
    exit 1
fi

# The first positional argument is the Python job file; remaining arguments
# are passed through to the job unchanged.
jobname="$1"
shift

#--- Working directory and log locations -------------------------------------
# Directory that contains the Python job files.
WORKDIR="/maui/commands"
# Root directory under which per-job log folders are created.
LOG_ROOT="/maui/logs"
# Job name without the .py extension (e.g. rawFromBloomberg.py -> rawFromBloomberg),
# used as the name of the job-specific log folder.
job_base=$(basename "$jobname" .py)
LOG_DIR="$LOG_ROOT/$job_base"
# Create the job-specific log directory (and any missing parents).
mkdir -p "$LOG_DIR"
# Delete log files older than 24 hours (1440 minutes) from this job's folder.
find "$LOG_DIR" -type f -mmin +1440 -delete

#--- Single-instance check ---------------------------------------------------
# Name of this wrapper script, excluded from the process search below.
wrapper_name=$(basename "$0")
# Exact command signature that identifies a running instance of the job.
pattern="python3 $jobname"
# Collect PIDs of matching processes, excluding grep itself and this wrapper.
running=$(ps ax -o pid,cmd | grep "$pattern" | grep -v grep | grep -v "$wrapper_name" | awk '{print $1}')
if [ -n "$running" ]; then
    echo "Job '$jobname' läuft bereits (PID(s): $running). Abbruch."
    # Exit without error so the scheduler does not report a failure; we simply
    # refuse to start a second instance.
    exit 0
fi

#--- Prepare logging and start the job ---------------------------------------
# Timestamp in YYYYMMDD-HHMMSS format makes the log file names unique.
timestamp=$(date "+%Y%m%d-%H%M%S")
# L_*.txt receives standard output, E_*.err receives error output.
STDOUT_LOG="$LOG_DIR/L_${timestamp}.txt"
ERROR_LOG="$LOG_DIR/E_${timestamp}.err"
# Change into the working directory or abort when it is not reachable.
cd "$WORKDIR" || { echo "Arbeitsverzeichnis $WORKDIR nicht erreichbar." >&2; exit 1; }
# Run the job, redirecting stdout and stderr to the dedicated log files.
python3 "$jobname" "$@" > "$STDOUT_LOG" 2> "$ERROR_LOG"

#--- Error notification ------------------------------------------------------
# When the error log is non-empty, post "<jobname> : <errors>" to the
# monitoring endpoint. The earlier duplicate multi-line payload assignment
# was merge residue and has been removed.
if [ -s "$ERROR_LOG" ]; then
    payload="$jobname : $(<"$ERROR_LOG")"
    curl -s -X POST https://ntfy.sh/itmaxDebug -d "$payload"
fi
import sys
sys.path.append("..")
import sys; sys.path.append("..")
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import QueuePool
import config.MysqlConfig as DatabaseConfig
from sshtunnel import SSHTunnelForwarder
# This class manages the MySQL connection and optionally uses an SSH tunnel.
# The configuration still comes from config.MysqlConfig; only the extended
# engine part (QueuePool, timeouts, etc.) is integrated here.
# NOTE(review): this block contains diff/merge residue ("@@" lines and a
# duplicated pool_recycle keyword) that must be resolved before the file runs.
class MysqlManager:
    # The constructor loads the configuration, conditionally creates an SSH
    # tunnel, builds an SQLAlchemy engine with a queue pool, and instantiates
    # the first session.
    def __init__(self):
        # ───────────────────────────────────────────────────────
        # Load configuration from the module
        # ───────────────────────────────────────────────────────
        # Database connection parameters taken from the configuration module.
        self.dbConfig = {
            "host": DatabaseConfig.MYSQL_HOST,
            "user": DatabaseConfig.MYSQL_USER,
......@@ -25,10 +20,10 @@ class MysqlManager:
            "port": DatabaseConfig.MYSQL_PORT,
        }
        # ───────────────────────────────────────────────────────
        # Optionally establish an SSH tunnel
        # ───────────────────────────────────────────────────────
        # Build an SSH tunnel when USE_SSH_TUNNEL is True; otherwise connect
        # directly to the configured database host and port.
        if getattr(DatabaseConfig, "USE_SSH_TUNNEL", False):
            # The forwarder connects a local port to the remote MySQL host
            # through SSH.
            self.sshTunnel = SSHTunnelForwarder(
                (DatabaseConfig.SSH_HOST, DatabaseConfig.SSH_PORT),
                ssh_username=DatabaseConfig.SSH_USERNAME,
......@@ -36,36 +31,46 @@ class MysqlManager:
                remote_bind_address=(self.dbConfig["host"], self.dbConfig["port"]),
            )
            self.sshTunnel.start()
            # With the tunnel active, connect to the locally forwarded endpoint.
            db_host = "127.0.0.1"
            db_port = self.sshTunnel.local_bind_port
        else:
            # No tunnel required.
            self.sshTunnel = None
            # Connect straight to the remote MySQL server.
            db_host = self.dbConfig["host"]
            db_port = self.dbConfig["port"]
        # ───────────────────────────────────────────────────────
        # Create the SQLAlchemy engine with QueuePool & timeouts
        # ───────────────────────────────────────────────────────
        # Engine with QueuePool and timeout settings taken from the configuration.
        self.engine = create_engine(
            f"mysql+pymysql://{self.dbConfig['user']}:{self.dbConfig['password']}@{db_host}:{db_port}/{self.dbConfig['database']}",
            echo=False,
            poolclass=QueuePool,
            pool_size=getattr(DatabaseConfig, "POOL_SIZE", 1),
            max_overflow=getattr(DatabaseConfig, "MAX_OVERFLOW", 0),
            # NOTE(review): the duplicated pool_recycle keyword below is merge
            # residue and a SyntaxError — exactly one of the two lines must stay.
            pool_recycle=getattr(DatabaseConfig, "POOL_RECYCLE", 3600),  # seconds
            pool_recycle=getattr(DatabaseConfig, "POOL_RECYCLE", 3600),
            pool_pre_ping=True,
            connect_args={"connect_timeout": getattr(DatabaseConfig, "CONNECT_TIMEOUT", 30)},
        )
        # Initialise the session factory immediately and keep the first session.
        self.dbSession = sessionmaker(bind=self.engine)()

    # This method returns the current SQLAlchemy session so callers can
    # interact with the database; no new session is created.
    def getSession(self):
        return self.dbSession

    # This method closes the current session and shuts down the SSH tunnel
    # when one was created.
    def close(self):
        # Release database resources held by the session.
        self.dbSession.close()
        # Stop the SSH tunnel when it exists.
        if self.sshTunnel:
            self.sshTunnel.stop()
"""
Main application (manager context).

Starts the Flask app, creates a WebManager, and registers all blueprints
centrally.

**New**
Every endpoint now requires the query parameter ?token=<value>. The supplied
token is validated against the token database; when it is missing or unknown,
the client receives
    HTTP/1.1 401 Unauthorized
    {"status": "NOAUTH", "message": "Please enter a valid token."}
"""
from __future__ import annotations
import sys
sys.path.append("..") # Projekt-Root im Suchpfad registrieren
import sys; sys.path.append("..")
from flask import Flask, request, jsonify
from manager.MysqlManager import MysqlManager
from models.token_toke import TokenToke
from routes.HealtCheckRouter import blueprint as health_router
from routes.BaseRouter import blueprint as tarifs_router
from routes.EeccxRouter import blueprint as eeccx_router
# --------------------------------------------------------------------------- #
# WebManager: registriert sämtliche Blueprints
# --------------------------------------------------------------------------- #
# Bundles blueprint registration so every route collection is attached to the
# Flask application in one place.
class WebManager:
    """Register all blueprints and bundle further infrastructure."""

    def __init__(self, app: Flask) -> None:
        # Keep a reference to the application, then wire up all routes once.
        self.app = app
        self._register_blueprints()

    def _register_blueprints(self) -> None:
        """Attach each known blueprint object to the Flask app."""
        blueprints = (health_router, tarifs_router, eeccx_router)
        for bp in blueprints:
            self.app.register_blueprint(bp)
# --------------------------------------------------------------------------- #
# App instance & global token check
# --------------------------------------------------------------------------- #
# NOTE(review): TOKEN_VALUE looks like a leftover of the previous
# constant-based check; the before_request hook validates tokens against the
# database instead — confirm whether this constant can be removed.
TOKEN_VALUE = "12345"  # allowed token value (legacy)
# A new Flask application instance is created and handed to the WebManager
# for blueprint registration.
app = Flask(__name__)
WebManager(app)
# This handler executes before every request to enforce the compulsory token
# parameter and validate it against the database.
@app.before_request
def _require_token():
    """
    Global pre-request hook.

    Rejects the request with HTTP 401 unless a ``?token=<value>`` query
    parameter is present and matches a TokenToke record in the database.
    Static files are served without token validation.

    The leftover constant-based check and the old 401 payload from the
    previous version were merge residue and have been removed.
    """
    # Flask static files are not protected.
    if request.endpoint == "static":
        return None
    # The token value from the query string, or None when absent.
    token = request.args.get("token")
    # Missing token -> reject immediately without touching the database.
    if not token:
        return (
            jsonify({"status": "NOAUTH", "message": "Please enter a valid token."}),
            401,
        )
    # Look the token up in the token table.
    session = MysqlManager().getSession()
    try:
        # True when a matching token record exists in the database.
        token_exists = (
            session.query(TokenToke)
            .filter_by(token_toke=token)
            .first()
            is not None
        )
    finally:
        # Always release the session, even when the query raises.
        session.close()
    # Unknown token -> reject with the same 401 payload.
    if not token_exists:
        return (
            jsonify({"status": "NOAUTH", "message": "Please enter a valid token."}),
            401,
        )
# --------------------------------------------------------------------------- #
# Entry point
# --------------------------------------------------------------------------- #
# The application starts on all network interfaces on port 80 when the module
# is executed directly.
if __name__ == "__main__":
    # NOTE(review): this binds the Flask development server to 0.0.0.0:80 —
    # presumably intended for container use only; confirm before exposing it.
    app.run(host="0.0.0.0", port=80)
from sqlalchemy import Column, Integer, String, DateTime
from models._system import Base
class TokenToke(Base):
    """ORM mapping for the ``token_toke`` table that stores access tokens."""

    __tablename__ = 'token_toke'

    # Surrogate primary key.
    id_toke = Column(Integer, primary_key=True, autoincrement=True)
    # Token string that clients present via the ?token query parameter.
    token_toke = Column(String(255), nullable=False)
    # Owner associated with the token (column owner_toke).
    owner_toke = Column(String(255), nullable=False)
    # Timestamp of record creation.
    created_toke = Column(DateTime, nullable=False)
# MAUI Data Toolkit
## Tutorials
....
....
....
## JupyterLab
To further develop or test this project, use JupyterLab. Note that the notebooks are only for concept work, with your PC as the Python environment. Production code runs only inside the Docker environment (stored in e.g. the "commands" or "manager" folders).
```bash
jupyter lab
```
## Docker
Use Docker to build this package for a production environment.

```bash
docker build --platform linux/amd64 -t maui:latest .
```

To run the container, use:

```bash
docker run -it -d --restart always -p 80:80 maui:latest
```
Alternatively, for local development with mounted volumes:
```bash
docker run -it \
-v ./commands:/maui/commands \
......
"""
Tariffs summary router.

Provides two endpoints:

    GET /base       - overview of all active base tariffs
    GET /base/<id>  - complete data set for one base object
                      (including deals & options)

The detail route contains the complete, corrected logic for assembling the
option hierarchy, so duplicates - e.g. when the same O-code appears in
several groups - are no longer lost.
"""
# --------------------------------------------------------------------------- #
# Standard- / Drittanbieter-Bibliotheken
# --------------------------------------------------------------------------- #
from __future__ import annotations
import sys
import sys; sys.path.append("..")
from typing import Any, Dict, List, Tuple
sys.path.append("..") # Projekt-Root für Manager & Models hinzufügen
from flask import Blueprint, jsonify, abort
from sqlalchemy.orm import Session
from sqlalchemy import func
# --------------------------------------------------------------------------- #
# Eigene Module
# --------------------------------------------------------------------------- #
from manager.MysqlManager import MysqlManager
from models.base_base import BaseBase
from models.deal_deal import DealDeal
from models.option_opti import OptionOpti
# --------------------------------------------------------------------------- #
# Blueprint
# --------------------------------------------------------------------------- #
# The blueprint name is the module name stripped of its package prefix, so
# registering it never triggers a ValueError (blueprint names may not contain
# dots).
blueprint = Blueprint(__name__.rsplit(".", 1)[-1], __name__)
# --------------------------------------------------------------------------- #
# Helper functions
# --------------------------------------------------------------------------- #
# This function builds the full JSON response for a given base object id,
# returning None when the id does not exist.
# NOTE(review): this block still contains diff/merge residue ("@@" lines,
# duplicated dict keys, a duplicated None check) that must be resolved.
def _build_base_response(session: Session, base_id: int) -> Dict[str, Any] | None:
    """
    Build the complete JSON structure for a given base object,
    including its deals and the hierarchically assembled options.
    """
    # --------------------------------------------------------------------- #
    # Load the base record
    # --------------------------------------------------------------------- #
    # Load the base record matching the requested id, or None when no match
    # is found.
    base_record: BaseBase | None = (
        session.query(BaseBase)
        .filter_by(id_base=base_id)
        .one_or_none()
    )
    if base_record is None:  # not found
        # NOTE(review): the duplicated check below is merge residue; the net
        # effect is a single early exit when the id does not exist.
        if base_record is None:
            return None
    # --------------------------------------------------------------------- #
    # Collect deals
    # --------------------------------------------------------------------- #
    # Load all deal records that belong to the current base object.
    deal_records: List[DealDeal] = (
        session.query(DealDeal)
        .filter_by(base_deal=base_record.id_base)
        .all()
    )
    # Convert the SQLAlchemy deal objects into plain dictionaries ready for
    # JSON serialization.
    deals: List[Dict[str, Any]] = [
        {
            "id": d.id_deal,
......@@ -88,24 +56,21 @@ def _build_base_response(session: Session, base_id: int) -> Dict[str, Any] | Non
        for d in deal_records
    ]
    # --------------------------------------------------------------------- #
    # Load options
    # --------------------------------------------------------------------- #
    # Load every option that belongs to the current base object.
    opti_records: List[OptionOpti] = (
        session.query(OptionOpti)
        .filter_by(base_opti=base_record.id_base)
        .all()
    )
    # --------------------------------------------------------------------- #
    # Turn options into nodes
    # --------------------------------------------------------------------- #
    # option_nodes   = list of all (node_dict, parent_code) tuples
    # category_nodes = mapping group code ("G…") -> node_dict
    option_nodes: List[Tuple[Dict[str, Any], str | None]] = []
    category_nodes: Dict[str, Dict[str, Any]] = {}
    # Convert each option record into a node dictionary and remember its
    # parent relationship.
    for o in opti_records:
        # The node carries all option attributes plus a list for child options.
        node: Dict[str, Any] = {
            "id": o.id_opti,
            "provisiongroup": o.provisiongroup_opti,
......@@ -121,45 +86,42 @@ def _build_base_response(session: Session, base_id: int) -> Dict[str, Any] | Non
            "provision4": float(o.provision4_opti),
            "created": o.created_opti.isoformat() if o.created_opti else None,
            "updated": o.updated_opti.isoformat() if o.updated_opti else None,
            # NOTE(review): the duplicated "items" key below is merge residue;
            # both entries carry the same empty-list value, so behavior is
            # unchanged, but one line should be removed.
            "items": [],  # room for child options
            "items": [],
        }
        # Group codes start with "G" — remember them so children can be
        # attached later.
        if o.providercode_opti.startswith("G"):
            category_nodes[o.providercode_opti] = node
        # Remember under which category/group this element hangs.
        option_nodes.append((node, o.providercategory_opti))
    # --------------------------------------------------------------------- #
    # Build the parent/child links
    # --------------------------------------------------------------------- #
    # Nodes without a valid parent become top-level entries grouped by their
    # parent code.
    root_nodes: Dict[str | None, List[Dict[str, Any]]] = {}
    # Attach every node either to its parent category or to the root
    # collection when no suitable parent exists.
    for node, parent_code in option_nodes:
        # Parent exists and was recognised as a group node -> attach child.
        if parent_code and parent_code in category_nodes:
            category_nodes[parent_code]["items"].append(node)
        else:
            # Top-level element (no matching group found).
            root_nodes.setdefault(parent_code, []).append(node)
    # Remove empty "items" lists so the client does not receive useless
    # empty arrays.
    for n in category_nodes.values():
        if not n["items"]:
            n.pop("items", None)
    # --------------------------------------------------------------------- #
    # Shape the final options list
    # --------------------------------------------------------------------- #
    # Build the final list of option groups for the JSON response.
    options: List[Dict[str, Any]] = [
        {"providercode": parent_code, "items": items}
        for parent_code, items in root_nodes.items()
    ]
    # --------------------------------------------------------------------- #
    # Merge base fields + details
    # --------------------------------------------------------------------- #
    # Gather the base table column values for JSON serialization.
    base_data: Dict[str, Any] = {
        "id": base_record.id_base,
        "provider": base_record.provider_base,
......@@ -168,60 +130,43 @@ def _build_base_response(session: Session, base_id: int) -> Dict[str, Any] | Non
        "alias": base_record.alias_base,
        "flyerurl": base_record.flyerurl_base,
        "piburl": base_record.piburl_base,
        # NOTE(review): the duplicated "created"/"updated" entries below are
        # merge residue; the later one-line versions win on dict construction,
        # so behavior matches the new version, but the old lines should go.
        "created": (
            base_record.created_base.isoformat()
            if base_record.created_base else None
        ),
        "updated": (
            base_record.updated_base.isoformat()
            if base_record.updated_base else None
        ),
        "created": base_record.created_base.isoformat() if base_record.created_base else None,
        "updated": base_record.updated_base.isoformat() if base_record.updated_base else None,
    }
    # Copy the JSON details column, or use an empty dict when the column is
    # NULL.
    details_data: Dict[str, Any] = (
        base_record.details_base.copy()
        if base_record.details_base else {}
    )
    # Remove the internal tariff_name helper key so it can be returned as a
    # dedicated attribute.
    ai_identified_name = details_data.pop("tariff_name", None)
    # Merge the core base data with the JSON details and the AI-identified name.
    merged_base = {
        **base_data,
        **details_data,
        "ai_identified_name": ai_identified_name,
    }
    # --------------------------------------------------------------------- #
    # Final result
    # --------------------------------------------------------------------- #
    # Return the assembled base, deal, and option data for JSON serialization.
    return {
        "base": merged_base,
        "deals": deals,
        "options": options,
    }
# --------------------------------------------------------------------------- #
# Routes
# --------------------------------------------------------------------------- #
# This route returns an overview of every base tariff that currently has at
# least one active deal (stops_deal IS NULL).
@blueprint.route("/base", methods=["GET"])
def base_overview():
    """
    Overview of all active base tariffs.

    SQL equivalent:
        SELECT id_base AS id,
               provider_base AS provider,
               providercode_base AS providercode,
               name_base AS name,
               alias_base AS alias
        FROM base_base
        INNER JOIN deal_deal ON base_deal = id_base
        WHERE stops_deal IS NULL
        GROUP BY id_base
        ORDER BY provider_base ASC;
    """
    # A new database session is opened through the MySQL manager.
    session = MysqlManager().getSession()
    try:
        # Select the distinct base objects that have at least one active deal.
        query = (
            session.query(
                BaseBase.id_base.label("id"),
......@@ -236,6 +181,7 @@ def base_overview():
            .order_by(BaseBase.provider_base.asc())
        )
        # Convert every result row into a plain dictionary ready for JSON
        # serialization.
        records: List[Dict[str, Any]] = [
            {
                "id": row.id,
......@@ -246,25 +192,32 @@ def base_overview():
            }
            for row in query.all()
        ]
    # The finally block guarantees that the session is always closed.
    finally:
        session.close()
    # Return the list of base objects as a JSON array.
    return jsonify(records)
# This route returns a complete JSON structure for a single base object or
# raises a 404 error when the id does not exist.
@blueprint.route("/base/<int:id>", methods=["GET"])
def base_details(id: int):
    """
    Detail route — returns the complete JSON structure for the given
    base object (including deals & options).

    Raises:
        404 via ``abort`` when no base object with the given id exists.
    """
    # A new database session is opened through the MySQL manager.
    session = MysqlManager().getSession()
    try:
        # The helper assembles the complete response structure or returns
        # None when the id is unknown.
        data = _build_base_response(session, id)
    finally:
        # The session is always closed, even when the helper raises.
        session.close()
    # Unknown id -> 404. (The duplicated None check left over from the merge
    # has been removed.)
    if data is None:
        abort(404, description=f"Base object with id={id} not found.")
    # Return the fully assembled JSON structure for the requested base id.
    return jsonify(data)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
import sys
sys.path.append("..")
import sys; sys.path.append("..")
import time
import hashlib
import io
import json
import os
import tempfile
from typing import List, Tuple
import requests
import urllib3
from flask import Blueprint, Response, request
from sqlalchemy.orm import joinedload
from config.MauiConfig import EECCX_TOKEN_URL, EECCX_API_URL, EECCX_CLIENT_ID, EECCX_CLIENT_SECRET, EECCX_CF_CLIENT_ID, EECCX_CF_CLIENT_SECRET, EECCX_HDL_NR, EECCX_PROV_HDL_NR
# --------------------------------------------------------------------------- #
# Eigene Module #
# --------------------------------------------------------------------------- #
from manager.S3Manager import S3Manager
from manager.MysqlManager import MysqlManager
from models.deal_deal import DealDeal
from models.base_base import BaseBase
from models.option_opti import OptionOpti
from config.MauiConfig import EECCX_TOKEN_URL, EECCX_API_URL, EECCX_CLIENT_ID, EECCX_CLIENT_SECRET, EECCX_CF_CLIENT_ID, EECCX_CF_CLIENT_SECRET, EECCX_HDL_NR, EECCX_PROV_HDL_NR
# --------------------------------------------------------------------------- #
# Suppress warnings about insecure HTTPS requests (dev only)                  #
# --------------------------------------------------------------------------- #
# Disabled so self-signed certificate usage does not flood the log in
# development environments.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# --------------------------------------------------------------------------- #
# Configuration / constants                                                   #
# --------------------------------------------------------------------------- #
# Identity provider endpoint used for OAuth authentication.
TOKEN_URL = EECCX_TOKEN_URL
# Partner API endpoint that generates the PDF.
API_URL = EECCX_API_URL
# OAuth client identifier for the back-end application.
CLIENT_ID = EECCX_CLIENT_ID
# OAuth client secret matching the client identifier.
CLIENT_SECRET = EECCX_CLIENT_SECRET
# Cloudflare Access client identifier required by the partner API.
CF_CLIENT_ID = EECCX_CF_CLIENT_ID
# Cloudflare Access client secret required by the partner API.
CF_CLIENT_SECRET = EECCX_CF_CLIENT_SECRET
# Dealer number that must be supplied in every partner API request.
HDL_NR = EECCX_HDL_NR
# Provisioning dealer number that must be supplied in every partner API request.
PROV_HDL_NR = EECCX_PROV_HDL_NR
# Fixed product category for the request payload.
PRODUKT_KATEGORIE = "O"
# --------------------------------------------------------------------------- #
# Blueprint                                                                   #
# --------------------------------------------------------------------------- #
# The blueprint name is the module name stripped of its package prefix so the
# router can be registered once inside the WebManager.
blueprint = Blueprint(__name__.rsplit(".", 1)[-1], __name__)
# S3 manager handling file uploads to the configured S3 bucket.
s3_manager = S3Manager()
# --------------------------------------------------------------------------- #
# Helper functions                                                            #
# --------------------------------------------------------------------------- #
# Build a JSON error response carrying the supplied message and HTTP status.
def _json_error(message: str, status_code: int = 500) -> Response:
    """Return a Flask Response with status "ERROR" and the given message."""
    body = {"status": "ERROR", "message": message}
    # ensure_ascii=False keeps non-ASCII characters readable in the payload.
    return Response(
        json.dumps(body, ensure_ascii=False),
        status=status_code,
        mimetype="application/json",
    )
# Read the requested option identifiers from the query string, accepting both
# repeated "options" parameters and a single comma-separated value.
def _extract_options() -> List[str]:
    """Return the option identifiers supplied via the ``options`` query args."""
    values = request.args.getlist("options")
    # A single comma-separated value is split into individual identifiers,
    # dropping empty fragments.
    if len(values) == 1 and "," in values[0]:
        fragments = values[0].split(",")
        return [frag.strip() for frag in fragments if frag.strip()]
    # Otherwise keep every non-empty value as-is.
    return [value for value in values if value]
# The function builds a unique hash from tarif_id, the option list, and the current timestamp so the uploaded PDF file name is collision‑free.
def _hash_id_options(tarif_id: str, options: List[str]) -> str:
# The key string concatenates tarif_id, the sorted option list, and the current Unix timestamp.
key = f"{tarif_id}:{','.join(sorted(options))}:{int(time.time())}".encode("utf-8")
return hashlib.sha256(key).hexdigest()
# ---------------------------- OAuth token ---------------------------------- #
# Exchange client credentials for an OAuth access token.
def _get_token() -> Tuple[str | None, str | None]:
    """
    Request an access token from the identity provider.

    Returns:
        (token, None) on success, or (None, error_message) on failure.
    """
    # Grant type and client credentials for the token request.
    payload = {
        "grant_type": "client_credentials",
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
    }
    try:
        # NOTE(review): verify=False skips TLS certificate validation —
        # acceptable only in development environments.
        resp = requests.post(TOKEN_URL, data=payload, verify=False, timeout=10)
        resp.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Network or HTTP failure -> report the error to the caller.
        return None, f"Token-Abruf fehlgeschlagen: {exc}"
    # Extract the access token field from the JSON response.
    token = resp.json().get("access_token")
    if not token:
        return None, "Kein access_token im Token-Response."
    # The duplicated unreachable "return token, None" from the merge has been
    # removed.
    return token, None
# ------------------------- Partner-API-Aufruf ------------------------------ #
# The function assembles the partner API payload, performs the HTTP PUT request, and returns either the JSON result or an error message.
def _partner_api(token: str, tarif_id: str, options: List[str]) -> Tuple[dict | None, str | None]:
# The headers dictionary includes OAuth, Cloudflare, and content‑type information required by the partner API.
headers = {
"Authorization": f"Bearer {token}",
"CF-Access-Client-Id": CF_CLIENT_ID,
......@@ -104,6 +127,8 @@ def _partner_api(token: str, tarif_id: str, options: List[str]) -> Tuple[dict |
}
try:
# A database session is opened to load the requested deal together with its base tariff information.
session = MysqlManager().getSession()
deal_int = int(tarif_id)
deal = (
......@@ -113,28 +138,43 @@ def _partner_api(token: str, tarif_id: str, options: List[str]) -> Tuple[dict |
.filter(DealDeal.id_deal == deal_int)
.one_or_none()
)
# The following conditional branch returns an error whenever the specified deal identifier cannot be found.
if deal is None:
return None, f"Deal {tarif_id} not found."
# The base_obj variable holds the related base tariff for the selected deal.
base_obj: BaseBase = deal.base
# The following conditional branch validates that a provider code exists for the base tariff; otherwise an error is returned.
if not base_obj or not base_obj.providercode_base:
return None, f"Kein providercode_base für Deal id={tarif_id} gefunden."
providercode_base_value = base_obj.providercode_base
providercode_deal_value = deal.providercode_deal
finally:
# The database session is always closed to free resources regardless of success or failure.
session.close()
# The am_aktion_id variable is initialised with None and used only when the deal provider code contains digits.
am_aktion_id: int | None = None
# The following conditional branch extracts numeric characters from the deal provider code to build the am_aktion_id value.
if providercode_deal_value:
digits = "".join(filter(str.isdigit, providercode_deal_value))
if digits:
am_aktion_id = int(digits)
# The service_codes list collects provider codes of selected options and their parent groups to pass them to the API.
service_codes: List[str] = []
try:
# A database session is opened to translate option identifiers to provider codes.
session = MysqlManager().getSession()
for opt_id in options:
# The following try/except converts the option identifier to an integer and skips invalid values.
try:
opt_int = int(opt_id)
except ValueError:
......@@ -144,11 +184,15 @@ def _partner_api(token: str, tarif_id: str, options: List[str]) -> Tuple[dict |
.filter(OptionOpti.id_opti == opt_int)
.one_or_none()
)
# The following conditional branch skips options without a provider code.
if not opt or not opt.providercode_opti:
continue
service_codes.append(opt.providercode_opti)
parent_code = opt.providercategory_opti
# This loop climbs up the category hierarchy so parent group codes are included as service codes.
while parent_code:
parent_opt = (
session.query(OptionOpti)
......@@ -156,14 +200,19 @@ def _partner_api(token: str, tarif_id: str, options: List[str]) -> Tuple[dict |
.limit(1)
.one_or_none()
)
# The following conditional branch breaks the traversal when the parent option is missing or lacks a provider code.
if not parent_opt or not parent_opt.providercode_opti:
break
service_codes.append(parent_opt.providercode_opti)
parent_code = parent_opt.providercategory_opti
finally:
# The database session is always closed to release connections.
session.close()
# The payload dictionary is prepared according to the partner API specification.
payload = {
"hdl_nr": HDL_NR,
"prov_hdl_nr": PROV_HDL_NR,
......@@ -171,21 +220,34 @@ def _partner_api(token: str, tarif_id: str, options: List[str]) -> Tuple[dict |
"produkt_kategorie": PRODUKT_KATEGORIE,
"service_code": service_codes,
}
# The following conditional branch adds the promotional action identifier when one has been detected.
if am_aktion_id is not None:
payload["am_aktion_id"] = am_aktion_id
try:
# A PUT request is sent to the partner API with the assembled payload and headers.
r = requests.put(API_URL, headers=headers, json=payload, verify=False, timeout=30)
r.raise_for_status()
except requests.exceptions.RequestException as exc:
# An error tuple is returned when the HTTP request fails.
return None, f"API-Aufruf fehlgeschlagen: {exc} – Payload: {payload}"
try:
# The JSON body of the HTTP response is parsed.
data = r.json()
except ValueError:
# An error tuple is returned when the response is not valid JSON.
return None, "Antwort der Partner-API ist kein JSON."
# The err_val variable is inspected for API‑level error information that must be mapped to a user‑friendly string.
err_val = data.get("error")
# The following conditional branch returns an error tuple when the API embedded error information in its JSON body.
if err_val:
if isinstance(err_val, list):
msg = "; ".join(str(e) for e in err_val)
......@@ -203,63 +265,68 @@ def _partner_api(token: str, tarif_id: str, options: List[str]) -> Tuple[dict |
return data, None
# ----------------------------- PDF-Download ------------------------------- #
def _download_pdf(url: str) -> Tuple[bytes | None, str | None]:
    """Fetch the PDF behind *url* and return ``(content, error)``.

    On success the second element is ``None``; on any request failure the
    first element is ``None`` and the second carries a German error message.
    """
    try:
        response = requests.get(url, stream=True, verify=False, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        return None, f"PDF-Download fehlgeschlagen: {exc}"
    # Accessing .content drains the streamed response fully into memory.
    return response.content, None
# --------------------------------------------------------------------------- #
# Route #
# --------------------------------------------------------------------------- #
@blueprint.route("/freenet-eeccx/<string:tarif_id>", methods=["GET"])
def eeccx_pdf(tarif_id: str):
    """
    Generate the EECCX PDF for a deal and return its public S3 download URL.

    Beispiel:
        /freenet-eeccx/3877325?options=G343,O3729
        /freenet-eeccx/3877325?options=G343&options=B…
    """
    # Options come from the query string and are forwarded to the partner API.
    options = _extract_options()
    # 1) OAuth-Token
    token, err = _get_token()
    if err:
        return _json_error(err, 502)
    # 2) Partner-API
    api_json, err = _partner_api(token, tarif_id, options)
    if err:
        return _json_error(err, 502)
    # 3) PDF-URL extrahieren — the API may deliver either a PCS or a PCI link.
    pdf_url = api_json.get("pcsPdf") or api_json.get("pciPdf")
    if not pdf_url:
        msg = api_json.get("message") or "Keine PDF-URL in der API-Antwort."
        return _json_error(msg, 502)
    # 4) PDF laden
    pdf_bytes, err = _download_pdf(pdf_url)
    if err:
        return _json_error(err, 502)
    # 5) Temporäre Datei zum Upload schreiben — the hash makes the file name
    #    collision-free across concurrent requests.
    hash_name = _hash_id_options(tarif_id, options)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(pdf_bytes)
        tmp_path = tmp.name
    # 6) Upload zu S3 — the temp file is removed even when the upload raises,
    #    so failed requests do not leak files in the temp directory.
    s3_key = f"eeccx/{hash_name}.pdf"
    try:
        url = s3_manager.uploadFile(tmp_path, s3_key)
    finally:
        os.remove(tmp_path)
    if not url:
        return _json_error(f"Upload zu S3 fehlgeschlagen für key={s3_key}", 502)
    # 7) Download-URL als JSON zurückgeben
    payload = json.dumps({"url": url}, ensure_ascii=False)
    return Response(payload, status=200, mimetype="application/json")
\ No newline at end of file
"""
Health-Check-Router
Kapselt den Endpunkt / für den System-Gesundheitscheck.
"""
from flask import Blueprint, jsonify
# Blueprint-Name = Dateiname ohne Punkte; verhindert ValueError
# The blueprint instance is named after the current module without dots to avoid a ValueError on registration.
blueprint = Blueprint(__name__.rsplit(".", 1)[-1], __name__)
# Handles HTTP GET requests to the root path with a simple health-check reply.
@blueprint.route("/", methods=["GET"])
def index():
    """
    GET /

    Return a simple JSON status object for the system health check.
    """
    # The diff-merge residue left two return statements here, the second one
    # unreachable; the version including the "status" field is kept.
    return jsonify({"message": "The API is working.", "status": "OK"})
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment