ge-tool/backend/services/mongodb_service.py

"""
Module for MongoDB connection and data access.
Combines logic from the old project's mongodb.py and mongodb_submissions.py
"""
import os
import logging
import re
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Any
from pymongo import MongoClient, ASCENDING, DESCENDING
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError, DuplicateKeyError
from dotenv import load_dotenv
# Load environment variables from .env.local first, then .env
load_dotenv('.env.local')
load_dotenv() # Fallback to .env
# Module-level logger; handlers and formatting are configured on the root logger in main.py
logger = logging.getLogger(__name__)
# --- MongoDB Config ---
MONGODB_URI = os.getenv("MONGODB_URI")
if not MONGODB_URI:
    raise ValueError("MONGODB_URI not found in environment variables")
DATABASE_NAME = "schedule"
SUBMISSIONS_COLLECTION = "submissions"
TITLES_COLLECTION = "titles_data"
# --- Connection Caching ---
_mongodb_client = None
_submissions_collection = None
_titles_collection = None
# ----------------------
# Connection helpers
# ----------------------
def get_db_connection():
    """Initializes and returns the MongoDB database connection with caching."""
    global _mongodb_client
    if _mongodb_client is None:
        try:
            logger.debug("Initializing new MongoDB connection...")
            _mongodb_client = MongoClient(
                MONGODB_URI,
                serverSelectionTimeoutMS=5000,
                connectTimeoutMS=10000,
                socketTimeoutMS=10000
            )
            # Test connection
            _mongodb_client.admin.command('ping')
            logger.debug("MongoDB connection successful.")
        except (ConnectionFailure, ServerSelectionTimeoutError) as e:
            logger.error(f"Could not connect to MongoDB: {e}")
            _mongodb_client = None  # Reset on failure
            raise Exception(f"Could not connect to MongoDB: {e}")
        except Exception as e:
            logger.error(
                f"An unexpected error occurred during MongoDB initialization: {e}")
            _mongodb_client = None  # Reset on failure
            raise Exception(f"MongoDB initialization error: {e}")
    return _mongodb_client[DATABASE_NAME]

def get_submissions_collection():
    """Returns the submissions collection, initializing the connection if needed."""
    global _submissions_collection
    if _submissions_collection is None:
        db = get_db_connection()
        _submissions_collection = db[SUBMISSIONS_COLLECTION]
        # Safe to call here: the cache above is already populated, so the
        # re-entrant get_submissions_collection() call inside just returns it.
        _create_submission_indexes()
    return _submissions_collection

def get_titles_collection():
    """Returns the titles collection, initializing the connection if needed."""
    global _titles_collection
    if _titles_collection is None:
        db = get_db_connection()
        _titles_collection = db[TITLES_COLLECTION]
    return _titles_collection

def close_mongodb_connection():
    """Closes the MongoDB connection if it exists."""
    global _mongodb_client, _submissions_collection, _titles_collection
    if _mongodb_client:
        _mongodb_client.close()
        _mongodb_client = None
        # Also drop the cached collection handles so a later call
        # re-initializes against a fresh client instead of the closed one.
        _submissions_collection = None
        _titles_collection = None
        logger.debug("MongoDB connection closed.")

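# A minimal sketch of how these helpers might be wired into the application
# lifecycle. It assumes a FastAPI app in main.py (hypothetical; the framework
# is not confirmed by this module) and is kept as a comment so importing this
# module stays side-effect free:
#
#   from contextlib import asynccontextmanager
#   from fastapi import FastAPI
#   from services import mongodb_service
#
#   @asynccontextmanager
#   async def lifespan(app: FastAPI):
#       mongodb_service.get_db_connection()   # fail fast on bad config
#       yield
#       mongodb_service.close_mongodb_connection()
#
#   app = FastAPI(lifespan=lifespan)
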
# -------------------------------
# Indexes and initialization
# -------------------------------
def _create_submission_indexes():
    """Creates necessary indexes for the submissions collection."""
    try:
        collection = get_submissions_collection()
        # Unique submission_id
        collection.create_index(
            "submission_id", unique=True, name="idx_submission_id")
        # Timestamp for sorting
        collection.create_index(
            [("created_at", DESCENDING)], name="idx_created_at")
        # Status index
        collection.create_index("status", name="idx_status")
        # Compound index for queue ordering
        collection.create_index(
            [("status", ASCENDING), ("queue_position", ASCENDING)], name="idx_queue")
        # TTL index - automatically delete submissions 30 days after created_at.
        # Note: TTL expiry applies regardless of status, so even documents
        # still marked pending are removed once they are 30 days old.
        collection.create_index(
            "created_at", expireAfterSeconds=2592000, name="idx_ttl")
        logger.debug("Submission indexes created successfully.")
    except Exception as e:
        logger.error(f"Error creating submission indexes: {e}")

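# The compound idx_queue index backs the queue reads used elsewhere in this
# module, e.g. (illustrative only):
#
#   collection.find({"status": "pending"}).sort("queue_position", ASCENDING)
#   collection.find_one({"status": "pending"},
#                       sort=[("queue_position", ASCENDING)])
#
# Equality on status plus a sort on queue_position matches the index's key
# order, so these queries avoid an in-memory sort.
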
# ---------------------------------------------------
# Submissions Logic (adapted from mongodb_submissions.py)
# ---------------------------------------------------
def create_submission(submission_id: str, usernames: List[str], ge_input: str) -> Dict[str, Any]:
    """Creates a new submission with 'pending' status and assigns a queue_position."""
    try:
        collection = get_submissions_collection()
        now = datetime.utcnow()
        # Determine next queue position among pending submissions.
        # Note: this find-then-insert is not atomic, so two concurrent
        # creates could be assigned the same position.
        max_doc = collection.find_one(
            {"status": "pending"}, sort=[("queue_position", DESCENDING)])
        next_position = (max_doc.get("queue_position") + 1) if (
            max_doc and max_doc.get("queue_position") is not None) else 1
        submission_doc = {
            "submission_id": submission_id,
            "timestamp": now,
            "status": "pending",
            "input": {
                "usernames": usernames,
                "ge_input": ge_input
            },
            "results": [],
            "error_message": None,
            "created_at": now,
            "updated_at": now,
            "processing_started_at": None,
            "processing_completed_at": None,
            "queue_position": next_position,
            "retry_count": 0,
            "last_retry_at": None
        }
        result = collection.insert_one(submission_doc)
        # Convert ObjectId to string so the document is JSON-serializable
        submission_doc["_id"] = str(result.inserted_id)
        logger.debug(
            f"Created submission: {submission_id} at position {next_position}")
        return submission_doc
    except DuplicateKeyError:
        raise Exception(f"Submission ID {submission_id} already exists")
    except Exception as e:
        logger.error(f"Error creating submission: {e}")
        raise Exception(f"Could not create submission: {e}")

def get_submission_by_id(submission_id: str) -> Optional[Dict[str, Any]]:
    """Fetches a submission by its submission_id."""
    try:
        collection = get_submissions_collection()
        doc = collection.find_one({"submission_id": submission_id})
        if doc:
            doc["_id"] = str(doc["_id"])
        return doc
    except Exception as e:
        logger.error(f"Error fetching submission {submission_id}: {e}")
        return None

def get_submissions(limit: int = 50, status: Optional[str] = None) -> List[Dict[str, Any]]:
    """Fetches submissions, optionally filtered by status, newest first."""
    try:
        collection = get_submissions_collection()
        query = {}
        if status:
            query["status"] = status
        cursor = collection.find(query).sort(
            "created_at", DESCENDING).limit(limit)
        subs = []
        for doc in cursor:
            doc["_id"] = str(doc["_id"])
            subs.append(doc)
        return subs
    except Exception as e:
        logger.error(f"Error fetching submissions: {e}")
        return []

def get_pending_submissions() -> List[Dict[str, Any]]:
    """Returns pending submissions ordered by queue_position ascending."""
    try:
        collection = get_submissions_collection()
        cursor = collection.find({"status": "pending"}).sort(
            "queue_position", ASCENDING)
        subs = []
        for doc in cursor:
            doc["_id"] = str(doc["_id"])
            subs.append(doc)
        return subs
    except Exception as e:
        logger.error(f"Error fetching pending submissions: {e}")
        return []

def get_next_pending_submission() -> Optional[Dict[str, Any]]:
    """Return the next pending submission (lowest queue_position)."""
    try:
        collection = get_submissions_collection()
        doc = collection.find_one(
            {"status": "pending"}, sort=[("queue_position", ASCENDING)])
        if doc:
            doc["_id"] = str(doc["_id"])
        return doc
    except Exception as e:
        logger.error(f"Error fetching next pending submission: {e}")
        return None

def update_submission(
    submission_id: str,
    status: str,
    results: Optional[List[Dict]] = None,
    error_message: Optional[str] = None
) -> bool:
    """Updates the status and results of a submission and manages timestamps/queue position."""
    try:
        collection = get_submissions_collection()
        update_data = {
            "status": status,
            "updated_at": datetime.utcnow()
        }
        if status == "processing":
            update_data["processing_started_at"] = datetime.utcnow()
        elif status in ["completed", "failed"]:
            update_data["processing_completed_at"] = datetime.utcnow()
            update_data["queue_position"] = None
        if status == "completed" and results is not None:
            update_data["results"] = results
        if status == "failed" and error_message is not None:
            update_data["error_message"] = error_message
        result = collection.update_one(
            {"submission_id": submission_id}, {"$set": update_data})
        # Use matched_count rather than modified_count: re-applying an
        # identical status would otherwise be misreported as "not found".
        if result.matched_count > 0:
            logger.debug(
                f"Updated submission {submission_id} to status {status}")
            return True
        else:
            logger.warning(f"No submission found with ID: {submission_id}")
            return False
    except Exception as e:
        logger.error(f"Error updating submission {submission_id}: {e}")
        return False

def delete_submission(submission_id: str) -> bool:
    """Deletes a submission by its ID."""
    try:
        collection = get_submissions_collection()
        result = collection.delete_one({"submission_id": submission_id})
        if result.deleted_count > 0:
            logger.debug(f"Deleted submission: {submission_id}")
            return True
        else:
            logger.warning(f"No submission found with ID: {submission_id}")
            return False
    except Exception as e:
        logger.error(f"Error deleting submission {submission_id}: {e}")
        return False

def increment_retry_count(submission_id: str) -> bool:
    """Increment retry_count and set last_retry_at/updated_at."""
    try:
        collection = get_submissions_collection()
        result = collection.update_one(
            {"submission_id": submission_id},
            {
                "$inc": {"retry_count": 1},
                "$set": {"last_retry_at": datetime.utcnow(), "updated_at": datetime.utcnow()}
            }
        )
        return result.modified_count > 0
    except Exception as e:
        logger.error(f"Error incrementing retry count for {submission_id}: {e}")
        return False

def requeue_stuck_submissions(timeout_minutes: int = 30) -> int:
    """Requeue submissions stuck in processing longer than timeout_minutes back to pending."""
    try:
        collection = get_submissions_collection()
        timeout_date = datetime.utcnow() - timedelta(minutes=timeout_minutes)
        result = collection.update_many(
            {"status": "processing", "processing_started_at": {"$lt": timeout_date}},
            {
                "$set": {
                    "status": "pending",
                    "updated_at": datetime.utcnow(),
                    "processing_started_at": None
                },
                "$inc": {"retry_count": 1}
            }
        )
        logger.debug(f"Requeued {result.modified_count} stuck submissions")
        return result.modified_count
    except Exception as e:
        logger.error(f"Error requeuing stuck submissions: {e}")
        return 0

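# Taken together, the functions above compose into a simple polling queue.
# A consumer might look like the following sketch (hypothetical code, not part
# of this module; process_submission is a stand-in for the real job, and a
# single worker is assumed since the find-then-update here is not atomic):
#
#   import time
#
#   while True:
#       requeue_stuck_submissions(timeout_minutes=30)
#       sub = get_next_pending_submission()
#       if sub is None:
#           time.sleep(5)
#           continue
#       update_submission(sub["submission_id"], "processing")
#       try:
#           results = process_submission(sub["input"])
#           update_submission(sub["submission_id"], "completed", results=results)
#       except Exception as e:
#           increment_retry_count(sub["submission_id"])
#           update_submission(sub["submission_id"], "failed", error_message=str(e))
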
def cleanup_excess_submissions(max_keep: int = 15) -> int:
    """Keep only the newest `max_keep` completed/failed submissions; delete older ones."""
    try:
        collection = get_submissions_collection()
        count = collection.count_documents(
            {"status": {"$in": ["completed", "failed"]}})
        if count <= max_keep:
            logger.debug(
                f"Current completed/failed count ({count}) <= max_keep ({max_keep}), nothing to clean up")
            return 0
        to_delete = count - max_keep
        old_docs = list(collection.find(
            {"status": {"$in": ["completed", "failed"]}},
            {"_id": 1}
        ).sort("created_at", ASCENDING).limit(to_delete))
        if not old_docs:
            return 0
        ids = [d["_id"] for d in old_docs]
        result = collection.delete_many({"_id": {"$in": ids}})
        logger.debug(f"Cleaned up {result.deleted_count} excess submissions")
        return result.deleted_count
    except Exception as e:
        logger.error(f"Error cleaning up excess submissions: {e}")
        return 0

def cleanup_old_submissions(days: int = 30) -> int:
    """Delete completed/failed submissions older than `days` days."""
    try:
        collection = get_submissions_collection()
        cutoff = datetime.utcnow() - timedelta(days=days)
        result = collection.delete_many(
            {"created_at": {"$lt": cutoff}, "status": {"$in": ["completed", "failed"]}})
        logger.debug(f"Cleaned up {result.deleted_count} old submissions")
        return result.deleted_count
    except Exception as e:
        logger.error(f"Error cleaning up old submissions: {e}")
        return 0

def get_statistics() -> Dict[str, int]:
    """Return counts grouped by status and total."""
    try:
        collection = get_submissions_collection()
        pipeline = [{"$group": {"_id": "$status", "count": {"$sum": 1}}}]
        results = list(collection.aggregate(pipeline))
        stats = {"total": 0, "pending": 0,
                 "processing": 0, "completed": 0, "failed": 0}
        for item in results:
            status = item.get("_id")
            count = item.get("count", 0)
            if status in stats:
                stats[status] = count
                stats["total"] += count
        return stats
    except Exception as e:
        logger.error(f"Error getting statistics: {e}")
        return {"total": 0, "pending": 0, "processing": 0, "completed": 0, "failed": 0}

# ---------------------------------------------------
# Titles Logic (from mongodb.py)
# ---------------------------------------------------
# Note: This part is not directly used by the permission page,
# but it's good to have it here for future use.
def get_tms_data(ge_id: str, orig_lang: str) -> Optional[str]:
    """
    Fetches the TMS ID from the titles_data collection.
    Returns the TMS ID as a string or None if not found.
    """
    try:
        collection = get_titles_collection()
        query = {"geId": str(ge_id).strip(),
                 "lang": str(orig_lang).strip().upper()}
        document = collection.find_one(query)
        if not document:
            logger.warning(
                f"No document found for geId: {ge_id}, lang: {orig_lang}")
            return None
        # Try to extract from trTmsLink first, e.g. a link containing
        # ".../project/12345/..." yields "12345" (illustrative shape only)
        tms_link = document.get("trTmsLink")
        if tms_link and isinstance(tms_link, str):
            match = re.search(r'/project/(\d+)', tms_link)
            if match:
                return match.group(1)
        # Fall back to an explicit tmsId field if present
        tms_id_direct = document.get("tmsId")
        if tms_id_direct:
            return str(tms_id_direct).strip()
        return None
    except Exception as e:
        logger.error(f"Error querying MongoDB for TMS data: {e}")
        return None

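# get_tms_data above and the two helpers below all repeat the same geId/lang
# lookup. One way to factor that out is a shared private helper (a sketch
# only, not applied here; _find_title_document is a hypothetical name):
#
#   def _find_title_document(ge_id: str, orig_lang: str) -> Optional[Dict[str, Any]]:
#       collection = get_titles_collection()
#       return collection.find_one({
#           "geId": str(ge_id).strip(),
#           "lang": str(orig_lang).strip().upper(),
#       })
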
def get_path_from_tms_data(ge_id: str, orig_lang: str) -> Optional[str]:
    """
    Fetches the NAS path from the titles_data collection for raw file downloads.
    Uses the same query logic as get_tms_data but returns the 'path' field.
    Returns the path as a string or None if not found.
    """
    try:
        collection = get_titles_collection()
        query = {"geId": str(ge_id).strip(),
                 "lang": str(orig_lang).strip().upper()}
        document = collection.find_one(query)
        if not document:
            logger.warning(
                f"No document found for geId: {ge_id}, lang: {orig_lang}")
            return None
        # Get the path field directly
        path = document.get("path")
        if path and isinstance(path, str):
            return path.strip()
        logger.warning(
            f"No path field found for geId: {ge_id}, lang: {orig_lang}")
        return None
    except Exception as e:
        logger.error(f"Error querying MongoDB for path data: {e}")
        return None

def get_sharing_link_from_tms_data(ge_id: str, orig_lang: str) -> Optional[str]:
    """
    Fetches the sharing link (linkRaw) from the titles_data collection.
    Used for displaying the source in sharing mode downloads.
    Returns the linkRaw as a string or None if not found.
    """
    try:
        collection = get_titles_collection()
        query = {"geId": str(ge_id).strip(),
                 "lang": str(orig_lang).strip().upper()}
        document = collection.find_one(query)
        if not document:
            logger.warning(
                f"No document found for geId: {ge_id}, lang: {orig_lang}")
            return None
        # Get the linkRaw field
        link_raw = document.get("linkRaw")
        if link_raw and isinstance(link_raw, str):
            return link_raw.strip()
        logger.warning(
            f"No linkRaw field found for geId: {ge_id}, lang: {orig_lang}")
        return None
    except Exception as e:
        logger.error(f"Error querying MongoDB for linkRaw data: {e}")
        return None

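# Minimal manual smoke test (assumes a reachable MongoDB and MONGODB_URI in
# the environment; the submission payload below is made up for illustration):
if __name__ == "__main__":
    import uuid

    sid = f"smoke-{uuid.uuid4().hex[:8]}"
    created = create_submission(sid, usernames=["alice"], ge_input="example")
    print("created at queue position:", created["queue_position"])
    update_submission(sid, "processing")
    update_submission(sid, "completed", results=[{"ok": True}])
    print("statistics:", get_statistics())
    delete_submission(sid)
    close_mongodb_connection()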