"""
|
|
Module for MongoDB connection and data access.
|
|
Combines logic from the old project's mongodb.py and mongodb_submissions.py
|
|
"""
|
|
|
|
import os
import logging
import re
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Any

from pymongo import MongoClient, ASCENDING, DESCENDING
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError, DuplicateKeyError
from dotenv import load_dotenv

# Load environment variables from .env.local first, then .env
load_dotenv('.env.local')
load_dotenv()  # Fallback to .env

# Use logger from root (configured in main.py)
logger = logging.getLogger(__name__)

# --- MongoDB Config ---
MONGODB_URI = os.getenv("MONGODB_URI")
if not MONGODB_URI:
    raise ValueError("MONGODB_URI not found in environment variables")

DATABASE_NAME = "schedule"
SUBMISSIONS_COLLECTION = "submissions"
TITLES_COLLECTION = "titles_data"

# --- Connection Caching ---
_mongodb_client = None
_submissions_collection = None
_titles_collection = None

# ----------------------
# Connection helpers
# ----------------------


def get_db_connection():
    """Initializes and returns the MongoDB database connection with caching."""
    global _mongodb_client
    if _mongodb_client is None:
        try:
            logger.debug("Initializing new MongoDB connection...")
            _mongodb_client = MongoClient(
                MONGODB_URI,
                serverSelectionTimeoutMS=5000,
                connectTimeoutMS=10000,
                socketTimeoutMS=10000
            )
            # Test the connection
            _mongodb_client.admin.command('ping')
            logger.debug("MongoDB connection successful.")
        except (ConnectionFailure, ServerSelectionTimeoutError) as e:
            logger.error(f"Could not connect to MongoDB: {e}")
            _mongodb_client = None  # Reset on failure
            raise Exception(f"Could not connect to MongoDB: {e}")
        except Exception as e:
            logger.error(
                f"An unexpected error occurred during MongoDB initialization: {e}")
            _mongodb_client = None  # Reset on failure
            raise Exception(f"MongoDB initialization error: {e}")

    return _mongodb_client[DATABASE_NAME]


def get_submissions_collection():
    """Returns the submissions collection, initializing the connection if needed."""
    global _submissions_collection
    if _submissions_collection is None:
        db = get_db_connection()
        # Cache the handle before creating indexes: _create_submission_indexes()
        # calls back into this function and relies on the cached value.
        _submissions_collection = db[SUBMISSIONS_COLLECTION]
        _create_submission_indexes()
    return _submissions_collection


def get_titles_collection():
    """Returns the titles collection, initializing the connection if needed."""
    global _titles_collection
    if _titles_collection is None:
        db = get_db_connection()
        _titles_collection = db[TITLES_COLLECTION]
    return _titles_collection


def close_mongodb_connection():
    """Closes the MongoDB connection if it exists."""
    global _mongodb_client, _submissions_collection, _titles_collection
    if _mongodb_client:
        _mongodb_client.close()
        _mongodb_client = None
        # Also drop the cached collection handles so a later call
        # re-initializes them instead of reusing the closed client.
        _submissions_collection = None
        _titles_collection = None
        logger.debug("MongoDB connection closed.")


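# Optional shutdown hook (illustrative sketch, left commented out): a hosting
# app could register close_mongodb_connection() with the standard-library
# atexit module so the client is closed on interpreter exit. Whether this is
# appropriate depends on the app's own lifecycle management.
# import atexit
# atexit.register(close_mongodb_connection)
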
# -------------------------------
# Indexes and initialization
# -------------------------------


def _create_submission_indexes():
    """Creates necessary indexes for the submissions collection."""
    try:
        collection = get_submissions_collection()
        # Unique submission_id
        collection.create_index(
            "submission_id", unique=True, name="idx_submission_id")
        # Timestamp for sorting
        collection.create_index(
            [("created_at", DESCENDING)], name="idx_created_at")
        # Status index
        collection.create_index("status", name="idx_status")
        # Compound index for queue ordering
        collection.create_index(
            [("status", ASCENDING), ("queue_position", ASCENDING)], name="idx_queue")
        # TTL index - automatically delete submissions 30 days (2592000 s) after created_at
        collection.create_index(
            "created_at", expireAfterSeconds=2592000, name="idx_ttl")
        logger.debug("Submission indexes created successfully.")
    except Exception as e:
        logger.error(f"Error creating submission indexes: {e}")


# ---------------------------------------------------
# Submissions Logic (adapted from mongodb_submissions.py)
# ---------------------------------------------------


def create_submission(submission_id: str, usernames: List[str], ge_input: str) -> Dict[str, Any]:
    """Creates a new submission with 'pending' status and assigns a queue_position."""
    try:
        collection = get_submissions_collection()
        now = datetime.utcnow()
        # Determine the next queue position among pending submissions
        max_doc = collection.find_one(
            {"status": "pending"}, sort=[("queue_position", DESCENDING)])
        next_position = (max_doc.get("queue_position") + 1) if (
            max_doc and max_doc.get("queue_position") is not None) else 1

        submission_doc = {
            "submission_id": submission_id,
            "timestamp": now,
            "status": "pending",
            "input": {
                "usernames": usernames,
                "ge_input": ge_input
            },
            "results": [],
            "error_message": None,
            "created_at": now,
            "updated_at": now,
            "processing_started_at": None,
            "processing_completed_at": None,
            "queue_position": next_position,
            "retry_count": 0,
            "last_retry_at": None
        }

        result = collection.insert_one(submission_doc)
        # Convert ObjectId to string
        submission_doc["_id"] = str(result.inserted_id)
        logger.debug(
            f"Created submission: {submission_id} at position {next_position}")
        return submission_doc
    except DuplicateKeyError:
        raise Exception(f"Submission ID {submission_id} already exists")
    except Exception as e:
        logger.error(f"Error creating submission: {e}")
        raise Exception(f"Could not create submission: {e}")


def get_submission_by_id(submission_id: str) -> Optional[Dict[str, Any]]:
    """Fetches a submission by its submission_id."""
    try:
        collection = get_submissions_collection()
        doc = collection.find_one({"submission_id": submission_id})
        if doc:
            doc["_id"] = str(doc["_id"])
        return doc
    except Exception as e:
        logger.error(f"Error fetching submission {submission_id}: {e}")
        return None


def get_submissions(limit: int = 50, status: Optional[str] = None) -> List[Dict[str, Any]]:
    """Fetches submissions, optionally filtered by status, newest first."""
    try:
        collection = get_submissions_collection()
        query = {}
        if status:
            query["status"] = status
        cursor = collection.find(query).sort(
            "created_at", DESCENDING).limit(limit)
        subs = []
        for doc in cursor:
            doc["_id"] = str(doc["_id"])
            subs.append(doc)
        return subs
    except Exception as e:
        logger.error(f"Error fetching submissions: {e}")
        return []


def get_pending_submissions() -> List[Dict[str, Any]]:
    """Returns pending submissions ordered by queue_position ascending."""
    try:
        collection = get_submissions_collection()
        cursor = collection.find({"status": "pending"}).sort(
            "queue_position", ASCENDING)
        subs = []
        for doc in cursor:
            doc["_id"] = str(doc["_id"])
            subs.append(doc)
        return subs
    except Exception as e:
        logger.error(f"Error fetching pending submissions: {e}")
        return []


def get_next_pending_submission() -> Optional[Dict[str, Any]]:
    """Returns the next pending submission (lowest queue_position)."""
    try:
        collection = get_submissions_collection()
        doc = collection.find_one(
            {"status": "pending"}, sort=[("queue_position", ASCENDING)])
        if doc:
            doc["_id"] = str(doc["_id"])
        return doc
    except Exception as e:
        logger.error(f"Error fetching next pending submission: {e}")
        return None


def update_submission(
    submission_id: str,
    status: str,
    results: Optional[List[Dict]] = None,
    error_message: Optional[str] = None
) -> bool:
    """Updates the status and results of a submission and manages timestamps/queue position."""
    try:
        collection = get_submissions_collection()
        update_data = {
            "status": status,
            "updated_at": datetime.utcnow()
        }

        if status == "processing":
            update_data["processing_started_at"] = datetime.utcnow()
        elif status in ["completed", "failed"]:
            update_data["processing_completed_at"] = datetime.utcnow()
            # Finished submissions leave the queue
            update_data["queue_position"] = None
        if status == "completed" and results is not None:
            update_data["results"] = results
        if status == "failed" and error_message is not None:
            update_data["error_message"] = error_message

        result = collection.update_one(
            {"submission_id": submission_id}, {"$set": update_data})
        # Check matched_count rather than modified_count: a matched document
        # may be unmodified if it is already in the requested state.
        if result.matched_count > 0:
            logger.debug(
                f"Updated submission {submission_id} to status {status}")
            return True
        else:
            logger.warning(f"No submission found with ID: {submission_id}")
            return False
    except Exception as e:
        logger.error(f"Error updating submission {submission_id}: {e}")
        return False


def delete_submission(submission_id: str) -> bool:
    """Deletes a submission by its ID."""
    try:
        collection = get_submissions_collection()
        result = collection.delete_one({"submission_id": submission_id})
        if result.deleted_count > 0:
            logger.debug(f"Deleted submission: {submission_id}")
            return True
        else:
            logger.warning(f"No submission found with ID: {submission_id}")
            return False
    except Exception as e:
        logger.error(f"Error deleting submission {submission_id}: {e}")
        return False


def increment_retry_count(submission_id: str) -> bool:
    """Increments retry_count and sets last_retry_at/updated_at."""
    try:
        collection = get_submissions_collection()
        result = collection.update_one(
            {"submission_id": submission_id},
            {
                "$inc": {"retry_count": 1},
                "$set": {"last_retry_at": datetime.utcnow(), "updated_at": datetime.utcnow()}
            }
        )
        return result.modified_count > 0
    except Exception as e:
        logger.error(f"Error incrementing retry count for {submission_id}: {e}")
        return False


def requeue_stuck_submissions(timeout_minutes: int = 30) -> int:
    """Requeues submissions stuck in 'processing' longer than timeout_minutes back to 'pending'."""
    try:
        collection = get_submissions_collection()
        timeout_date = datetime.utcnow() - timedelta(minutes=timeout_minutes)
        result = collection.update_many(
            {"status": "processing", "processing_started_at": {"$lt": timeout_date}},
            {
                "$set": {
                    "status": "pending",
                    "updated_at": datetime.utcnow(),
                    "processing_started_at": None
                },
                "$inc": {"retry_count": 1}
            }
        )
        logger.debug(f"Requeued {result.modified_count} stuck submissions")
        return result.modified_count
    except Exception as e:
        logger.error(f"Error requeuing stuck submissions: {e}")
        return 0


def cleanup_excess_submissions(max_keep: int = 15) -> int:
    """Keeps only the newest `max_keep` completed/failed submissions; deletes older ones."""
    try:
        collection = get_submissions_collection()
        count = collection.count_documents(
            {"status": {"$in": ["completed", "failed"]}})
        if count <= max_keep:
            logger.debug(
                f"Current completed/failed count ({count}) <= max_keep ({max_keep}), nothing to clean up")
            return 0
        to_delete = count - max_keep
        # Oldest finished submissions first
        old_docs = list(
            collection.find({"status": {"$in": ["completed", "failed"]}}, {"_id": 1})
            .sort("created_at", ASCENDING)
            .limit(to_delete))
        if not old_docs:
            return 0
        ids = [d["_id"] for d in old_docs]
        result = collection.delete_many({"_id": {"$in": ids}})
        logger.debug(f"Cleaned up {result.deleted_count} excess submissions")
        return result.deleted_count
    except Exception as e:
        logger.error(f"Error cleaning up excess submissions: {e}")
        return 0


def cleanup_old_submissions(days: int = 30) -> int:
    """Deletes completed/failed submissions older than `days` days."""
    try:
        collection = get_submissions_collection()
        cutoff = datetime.utcnow() - timedelta(days=days)
        result = collection.delete_many(
            {"created_at": {"$lt": cutoff}, "status": {"$in": ["completed", "failed"]}})
        logger.debug(f"Cleaned up {result.deleted_count} old submissions")
        return result.deleted_count
    except Exception as e:
        logger.error(f"Error cleaning up old submissions: {e}")
        return 0


def get_statistics() -> Dict[str, int]:
    """Returns counts grouped by status, plus a total."""
    try:
        collection = get_submissions_collection()
        pipeline = [{"$group": {"_id": "$status", "count": {"$sum": 1}}}]
        results = list(collection.aggregate(pipeline))
        stats = {"total": 0, "pending": 0,
                 "processing": 0, "completed": 0, "failed": 0}
        for item in results:
            status = item.get("_id")
            count = item.get("count", 0)
            if status in stats:
                stats[status] = count
            stats["total"] += count
        return stats
    except Exception as e:
        logger.error(f"Error getting statistics: {e}")
        return {"total": 0, "pending": 0, "processing": 0, "completed": 0, "failed": 0}


# ---------------------------------------------------
# Titles Logic (from mongodb.py)
# ---------------------------------------------------
# Note: This part is not directly used by the permission page,
# but it is kept here for future use.


def get_tms_data(ge_id: str, orig_lang: str) -> Optional[str]:
    """
    Fetches the TMS ID from the titles_data collection.

    Returns the TMS ID as a string, or None if not found.
    """
    try:
        collection = get_titles_collection()
        query = {"geId": str(ge_id).strip(),
                 "lang": str(orig_lang).strip().upper()}
        document = collection.find_one(query)
        if not document:
            logger.warning(
                f"No document found for geId: {ge_id}, lang: {orig_lang}")
            return None
        # Try to extract the ID from trTmsLink first
        tms_link = document.get("trTmsLink")
        if tms_link and isinstance(tms_link, str):
            match = re.search(r'/project/(\d+)', tms_link)
            if match:
                return match.group(1)
        # Fall back to the tmsId field
        tms_id_direct = document.get("tmsId")
        if tms_id_direct:
            return str(tms_id_direct).strip()
        return None
    except Exception as e:
        logger.error(f"Error querying MongoDB for TMS data: {e}")
        return None


def get_path_from_tms_data(ge_id: str, orig_lang: str) -> Optional[str]:
    """
    Fetches the NAS path from the titles_data collection for raw file downloads.

    Uses the same query logic as get_tms_data but returns the 'path' field.
    Returns the path as a string, or None if not found.
    """
    try:
        collection = get_titles_collection()
        query = {"geId": str(ge_id).strip(),
                 "lang": str(orig_lang).strip().upper()}
        document = collection.find_one(query)
        if not document:
            logger.warning(
                f"No document found for geId: {ge_id}, lang: {orig_lang}")
            return None

        # Get the path field directly
        path = document.get("path")
        if path and isinstance(path, str):
            return path.strip()

        logger.warning(
            f"No path field found for geId: {ge_id}, lang: {orig_lang}")
        return None
    except Exception as e:
        logger.error(f"Error querying MongoDB for path data: {e}")
        return None


def get_sharing_link_from_tms_data(ge_id: str, orig_lang: str) -> Optional[str]:
    """
    Fetches the sharing link (linkRaw) from the titles_data collection.

    Used for displaying the source in sharing-mode downloads.
    Returns the linkRaw as a string, or None if not found.
    """
    try:
        collection = get_titles_collection()
        query = {"geId": str(ge_id).strip(),
                 "lang": str(orig_lang).strip().upper()}
        document = collection.find_one(query)
        if not document:
            logger.warning(
                f"No document found for geId: {ge_id}, lang: {orig_lang}")
            return None

        # Get the linkRaw field
        link_raw = document.get("linkRaw")
        if link_raw and isinstance(link_raw, str):
            return link_raw.strip()

        logger.warning(
            f"No linkRaw field found for geId: {ge_id}, lang: {orig_lang}")
        return None
    except Exception as e:
        logger.error(f"Error querying MongoDB for linkRaw data: {e}")
        return None
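

# ---------------------------------------------------
# Usage example
# ---------------------------------------------------
# Minimal sketch of the submission lifecycle, runnable as a script. It assumes
# a reachable MongoDB at MONGODB_URI; the usernames, ge_input value, and
# results payload below are hypothetical placeholders, not values the rest of
# the codebase expects.
if __name__ == "__main__":
    import uuid

    logging.basicConfig(level=logging.DEBUG)

    # Enqueue a new submission at the back of the pending queue.
    sub_id = str(uuid.uuid4())
    create_submission(sub_id, usernames=["alice", "bob"], ge_input="12345")

    # A worker would claim the head of the queue and then report its outcome.
    nxt = get_next_pending_submission()
    if nxt:
        update_submission(nxt["submission_id"], "processing")
        update_submission(nxt["submission_id"], "completed",
                          results=[{"username": "alice", "ok": True}])

    # Periodic maintenance: recover stuck jobs and trim finished history.
    requeue_stuck_submissions(timeout_minutes=30)
    cleanup_excess_submissions(max_keep=15)

    print(get_statistics())
    close_mongodb_connection()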