""" Module for MongoDB connection and data access. Combines logic from the old project's mongodb.py and mongodb_submissions.py """ import os import logging import re from datetime import datetime, timedelta from typing import List, Dict, Optional, Any from pymongo import MongoClient, ASCENDING, DESCENDING from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError, DuplicateKeyError from dotenv import load_dotenv # Load environment variables from .env.local first, then .env load_dotenv('.env.local') load_dotenv() # Fallback to .env # Use logger from root (configured in main.py) logger = logging.getLogger(__name__) # --- MongoDB Config --- MONGODB_URI = os.getenv("MONGODB_URI") if not MONGODB_URI: raise ValueError("MONGODB_URI not found in environment variables") DATABASE_NAME = "schedule" SUBMISSIONS_COLLECTION = "submissions" TITLES_COLLECTION = "titles_data" # --- Connection Caching --- _mongodb_client = None _submissions_collection = None _titles_collection = None # ---------------------- # Connection helpers # ---------------------- def get_db_connection(): """Initializes and returns the MongoDB database connection with caching.""" global _mongodb_client if _mongodb_client is None: try: logger.debug("Initializing new MongoDB connection...") _mongodb_client = MongoClient( MONGODB_URI, serverSelectionTimeoutMS=5000, connectTimeoutMS=10000, socketTimeoutMS=10000 ) # Test connection _mongodb_client.admin.command('ping') logger.debug("MongoDB connection successful.") except (ConnectionFailure, ServerSelectionTimeoutError) as e: logger.error(f"Could not connect to MongoDB: {e}") _mongodb_client = None # Reset on failure raise Exception(f"Không thể kết nối MongoDB: {e}") except Exception as e: logger.error( f"An unexpected error occurred during MongoDB initialization: {e}") _mongodb_client = None # Reset on failure raise Exception(f"Lỗi khởi tạo MongoDB: {e}") return _mongodb_client[DATABASE_NAME] def get_submissions_collection(): """Returns the 
submissions collection, initializing the connection if needed.""" global _submissions_collection if _submissions_collection is None: db = get_db_connection() _submissions_collection = db[SUBMISSIONS_COLLECTION] _create_submission_indexes() return _submissions_collection def get_titles_collection(): """Returns the titles collection, initializing the connection if needed.""" global _titles_collection if _titles_collection is None: db = get_db_connection() _titles_collection = db[TITLES_COLLECTION] return _titles_collection def close_mongodb_connection(): """Closes the MongoDB connection if it exists.""" global _mongodb_client if _mongodb_client: _mongodb_client.close() _mongodb_client = None logger.debug("MongoDB connection closed.") # ------------------------------- # Indexes and initialization # ------------------------------- def _create_submission_indexes(): """Creates necessary indexes for the submissions collection.""" try: collection = get_submissions_collection() # Unique submission_id collection.create_index( "submission_id", unique=True, name="idx_submission_id") # Timestamp for sorting collection.create_index( [("created_at", DESCENDING)], name="idx_created_at") # Status index collection.create_index("status", name="idx_status") # Compound index for queue ordering collection.create_index( [("status", ASCENDING), ("queue_position", ASCENDING)], name="idx_queue") # TTL index - automatically delete submissions after 30 days collection.create_index( "created_at", expireAfterSeconds=2592000, name="idx_ttl") logger.debug("Submission indexes created successfully.") except Exception as e: logger.error(f"Error creating submission indexes: {e}") # --------------------------------------------------- # Submissions Logic (adapted from mongodb_submissions.py) # --------------------------------------------------- def create_submission(submission_id: str, usernames: List[str], ge_input: str) -> Dict[str, Any]: """Creates a new submission with 'pending' status and assigns 
a queue_position.""" try: collection = get_submissions_collection() now = datetime.utcnow() # Determine next queue position among pending max_doc = collection.find_one({"status": "pending"}, sort=[ ("queue_position", DESCENDING)]) next_position = (max_doc.get("queue_position") + 1) if ( max_doc and max_doc.get("queue_position") is not None) else 1 submission_doc = { "submission_id": submission_id, "timestamp": now, "status": "pending", "input": { "usernames": usernames, "ge_input": ge_input }, "results": [], "error_message": None, "created_at": now, "updated_at": now, "processing_started_at": None, "processing_completed_at": None, "queue_position": next_position, "retry_count": 0, "last_retry_at": None } result = collection.insert_one(submission_doc) # Convert ObjectId to string submission_doc["_id"] = str(result.inserted_id) logger.debug( f"Created submission: {submission_id} at position {next_position}") return submission_doc except DuplicateKeyError: raise Exception(f"Submission ID {submission_id} đã tồn tại") except Exception as e: logger.error(f"Error creating submission: {e}") raise Exception(f"Không thể tạo submission: {e}") def get_submission_by_id(submission_id: str) -> Optional[Dict[str, Any]]: """Fetches a submission by its submission_id.""" try: collection = get_submissions_collection() doc = collection.find_one({"submission_id": submission_id}) if doc: doc["_id"] = str(doc["_id"]) return doc except Exception as e: logger.error(f"Error fetching submission {submission_id}: {e}") return None def get_submissions(limit: int = 50, status: Optional[str] = None) -> List[Dict[str, Any]]: """Fetches submissions, optionally filtered by status, newest first.""" try: collection = get_submissions_collection() query = {} if status: query["status"] = status cursor = collection.find(query).sort( "created_at", DESCENDING).limit(limit) subs = [] for doc in cursor: doc["_id"] = str(doc["_id"]) subs.append(doc) return subs except Exception as e: logger.error(f"Error 
fetching submissions: {e}") return [] def get_pending_submissions() -> List[Dict[str, Any]]: """Returns pending submissions ordered by queue_position ascending.""" try: collection = get_submissions_collection() cursor = collection.find({"status": "pending"}).sort( "queue_position", ASCENDING) subs = [] for doc in cursor: doc["_id"] = str(doc["_id"]) subs.append(doc) return subs except Exception as e: logger.error(f"Error fetching pending submissions: {e}") return [] def get_next_pending_submission() -> Optional[Dict[str, Any]]: """Return the next pending submission (lowest queue_position).""" try: collection = get_submissions_collection() doc = collection.find_one({"status": "pending"}, sort=[ ("queue_position", ASCENDING)]) if doc: doc["_id"] = str(doc["_id"]) return doc except Exception as e: logger.error(f"Error fetching next pending submission: {e}") return None def update_submission( submission_id: str, status: str, results: Optional[List[Dict]] = None, error_message: Optional[str] = None ) -> bool: """Updates the status and results of a submission and manages timestamps/queue position.""" try: collection = get_submissions_collection() update_data = { "status": status, "updated_at": datetime.utcnow() } if status == "processing": update_data["processing_started_at"] = datetime.utcnow() elif status in ["completed", "failed"]: update_data["processing_completed_at"] = datetime.utcnow() update_data["queue_position"] = None if status == "completed" and results is not None: update_data["results"] = results if status == "failed" and error_message is not None: update_data["error_message"] = error_message result = collection.update_one( {"submission_id": submission_id}, {"$set": update_data}) if result.modified_count > 0: logger.debug( f"Updated submission {submission_id} to status {status}") return True else: logger.warning(f"No submission found with ID: {submission_id}") return False except Exception as e: logger.error(f"Error updating submission {submission_id}: 
{e}") return False def delete_submission(submission_id: str) -> bool: """Deletes a submission by its ID.""" try: collection = get_submissions_collection() result = collection.delete_one({"submission_id": submission_id}) if result.deleted_count > 0: logger.debug(f"Deleted submission: {submission_id}") return True else: logger.warning(f"No submission found with ID: {submission_id}") return False except Exception as e: logger.error(f"Error deleting submission {submission_id}: {e}") return False def increment_retry_count(submission_id: str) -> bool: """Increment retry_count and set last_retry_at/updated_at.""" try: collection = get_submissions_collection() result = collection.update_one( {"submission_id": submission_id}, { "$inc": {"retry_count": 1}, "$set": {"last_retry_at": datetime.utcnow(), "updated_at": datetime.utcnow()} } ) return result.modified_count > 0 except Exception as e: logger.error(f"Error increment retry count for {submission_id}: {e}") return False def requeue_stuck_submissions(timeout_minutes: int = 30) -> int: """Requeue submissions stuck in processing longer than timeout_minutes back to pending.""" try: collection = get_submissions_collection() timeout_date = datetime.utcnow() - timedelta(minutes=timeout_minutes) result = collection.update_many( {"status": "processing", "processing_started_at": {"$lt": timeout_date}}, {"$set": {"status": "pending", "updated_at": datetime.utcnow( ), "processing_started_at": None}, "$inc": {"retry_count": 1}} ) logger.debug(f"Requeued {result.modified_count} stuck submissions") return result.modified_count except Exception as e: logger.error(f"Error requeue stuck submissions: {e}") return 0 def cleanup_excess_submissions(max_keep: int = 15) -> int: """Keep only the newest `max_keep` completed/failed submissions; delete older ones.""" try: collection = get_submissions_collection() count = collection.count_documents( {"status": {"$in": ["completed", "failed"]}}) if count <= max_keep: logger.debug( f"Current 
completed/failed count ({count}) <= max_keep ({max_keep}), nothing to cleanup") return 0 to_delete = count - max_keep old_docs = list(collection.find({"status": {"$in": ["completed", "failed"]}}, { "_id": 1}).sort("created_at", ASCENDING).limit(to_delete)) if not old_docs: return 0 ids = [d["_id"] for d in old_docs] result = collection.delete_many({"_id": {"$in": ids}}) logger.debug(f"Cleaned up {result.deleted_count} excess submissions") return result.deleted_count except Exception as e: logger.error(f"Error cleanup excess submissions: {e}") return 0 def cleanup_old_submissions(days: int = 30) -> int: """Delete completed/failed submissions older than `days` days.""" try: collection = get_submissions_collection() cutoff = datetime.utcnow() - timedelta(days=days) result = collection.delete_many( {"created_at": {"$lt": cutoff}, "status": {"$in": ["completed", "failed"]}}) logger.debug(f"Cleaned up {result.deleted_count} old submissions") return result.deleted_count except Exception as e: logger.error(f"Error cleanup old submissions: {e}") return 0 def get_statistics() -> Dict[str, int]: """Return counts grouped by status and total.""" try: collection = get_submissions_collection() pipeline = [{"$group": {"_id": "$status", "count": {"$sum": 1}}}] results = list(collection.aggregate(pipeline)) stats = {"total": 0, "pending": 0, "processing": 0, "completed": 0, "failed": 0} for item in results: status = item.get("_id") count = item.get("count", 0) if status in stats: stats[status] = count stats["total"] += count return stats except Exception as e: logger.error(f"Error getting statistics: {e}") return {"total": 0, "pending": 0, "processing": 0, "completed": 0, "failed": 0} # --------------------------------------------------- # Titles Logic (from mongodb.py) # --------------------------------------------------- # Note: This part is not directly used by the permission page, # but it's good to have it here for future use. 
def _find_title_document(ge_id: str, orig_lang: str) -> Optional[Dict[str, Any]]:
    """Shared lookup for titles_data by geId and (upper-cased) language.

    Args:
        ge_id: GE identifier; stripped and matched as a string.
        orig_lang: Language code; stripped and upper-cased for the query.

    Returns:
        The raw document, or None when no match exists (a warning is logged).
        Exceptions are NOT caught here — each public wrapper keeps its own
        error handling so log messages stay per-feature.
    """
    collection = get_titles_collection()
    query = {"geId": str(ge_id).strip(),
             "lang": str(orig_lang).strip().upper()}
    document = collection.find_one(query)
    if not document:
        logger.warning(
            f"No document found for geId: {ge_id}, lang: {orig_lang}")
    return document


def get_tms_data(ge_id: str, orig_lang: str) -> Optional[str]:
    """
    Fetches the TMS ID from the titles_data collection.
    Returns the TMS ID as a string or None if not found.
    """
    try:
        document = _find_title_document(ge_id, orig_lang)
        if not document:
            return None

        # Try extract from trTmsLink first (the /project/<id> URL segment).
        tms_link = document.get("trTmsLink")
        if tms_link and isinstance(tms_link, str):
            match = re.search(r'/project/(\d+)', tms_link)
            if match:
                return match.group(1)

        # Fall back to an explicit tmsId field.
        tms_id_direct = document.get("tmsId")
        if tms_id_direct:
            return str(tms_id_direct).strip()

        return None
    except Exception as e:
        logger.error(f"Error querying MongoDB for TMS data: {e}")
        return None


def get_path_from_tms_data(ge_id: str, orig_lang: str) -> Optional[str]:
    """
    Fetches the NAS path from the titles_data collection for raw file downloads.
    Uses the same query logic as get_tms_data but returns the 'path' field.
    Returns the path as a string or None if not found.
    """
    try:
        document = _find_title_document(ge_id, orig_lang)
        if not document:
            return None

        # Get the path field directly
        path = document.get("path")
        if path and isinstance(path, str):
            return str(path).strip()

        logger.warning(
            f"No path field found for geId: {ge_id}, lang: {orig_lang}")
        return None
    except Exception as e:
        logger.error(f"Error querying MongoDB for path data: {e}")
        return None


def get_sharing_link_from_tms_data(ge_id: str, orig_lang: str) -> Optional[str]:
    """
    Fetches the sharing link (linkRaw) from the titles_data collection.
    Used for displaying source in sharing mode downloads.
    Returns the linkRaw as a string or None if not found.
    """
    try:
        document = _find_title_document(ge_id, orig_lang)
        if not document:
            return None

        # Get the linkRaw field
        link_raw = document.get("linkRaw")
        if link_raw and isinstance(link_raw, str):
            return str(link_raw).strip()

        logger.warning(
            f"No linkRaw field found for geId: {ge_id}, lang: {orig_lang}")
        return None
    except Exception as e:
        logger.error(f"Error querying MongoDB for linkRaw data: {e}")
        return None