#!/usr/bin/env python3
"""Sync metadata from per-blob JSON sidecar files in the R2 bucket into
storage.physical_blobs.

For every row with file_size = 0, read ``{content_hash}.json`` from R2 and
backfill file_size / mime_type / storage_path. If the sidecar is missing,
fall back to HEAD-probing the data object under a few known extensions.
"""
import os
import json
import asyncio

import boto3
import botocore.exceptions
import asyncpg

R2_ENDPOINT = "https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com"
R2_BUCKET = "deck"

# Fallback MIME types by file extension, used when the JSON sidecar omits
# jsonb_standard.L2_document.mime_type.
_MIME_BY_EXT = {
    "pdf": "application/pdf",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "png": "image/png",
}

# Extensions probed (in order) when a blob has no JSON sidecar at all.
_FALLBACK_EXTS = ("pdf", "png", "jpg")

# Single UPDATE used by both the sidecar path and the HEAD-probe path.
_UPDATE_SQL = """
    UPDATE storage.physical_blobs
    SET file_size = $2,
        mime_type = $3,
        storage_path = $4
    WHERE content_hash = $1
"""


def get_s3_client():
    """Build a boto3 S3 client pointed at the Cloudflare R2 endpoint.

    Credentials are read from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
    environment variables (may be None; boto3 then falls back to its own
    credential chain).
    """
    return boto3.client(
        "s3",
        endpoint_url=R2_ENDPOINT,
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    )


def _head_object(s3, key):
    """HEAD *key* in the R2 bucket.

    Returns (size_bytes, content_type) on success, or None if the object is
    missing or the request fails. head_object surfaces missing keys as a
    botocore ClientError (404), not a modeled NoSuchKey — catch that
    specifically instead of a bare except.
    """
    try:
        head = s3.head_object(Bucket=R2_BUCKET, Key=key)
    except botocore.exceptions.ClientError:
        return None
    return (
        head.get("ContentLength", 0),
        head.get("ContentType", "application/octet-stream"),
    )


async def _sync_from_head(s3, conn, content_hash):
    """Fallback when the JSON sidecar does not exist.

    Probe the data object under each known extension; on the first hit,
    update the row from the HEAD metadata. Returns True if a row was
    updated, False if no candidate object exists (so the caller can count
    it as an error — the original code silently dropped this case).
    """
    for ext in _FALLBACK_EXTS:
        file_key = f"{content_hash}.{ext}"
        info = _head_object(s3, file_key)
        if info is None:
            continue
        size_bytes, content_type = info
        await conn.execute(_UPDATE_SQL, content_hash, size_bytes, content_type, file_key)
        return True
    return False


async def _sync_one(s3, conn, content_hash):
    """Sync metadata for a single blob from its JSON sidecar.

    Returns True if the row was updated, False if neither the sidecar nor
    any fallback data object could be found. Unexpected failures propagate
    to the caller, which logs and counts them.
    """
    json_key = f"{content_hash}.json"
    try:
        obj = s3.get_object(Bucket=R2_BUCKET, Key=json_key)
    except s3.exceptions.NoSuchKey:
        # No sidecar — try to recover size/type from the data object itself.
        return await _sync_from_head(s3, conn, content_hash)

    meta = json.loads(obj["Body"].read())

    # Extract the fields of interest from the sidecar document.
    l2 = meta.get("jsonb_standard", {}).get("L2_document", {})
    size_bytes = l2.get("size_bytes", 0)
    mime_type = l2.get("mime_type")
    ext = meta.get("ext", "pdf")
    url_atc = meta.get("url_atc", [])
    storage_path = url_atc[0] if url_atc else f"{content_hash}.{ext}"

    if not mime_type:
        mime_type = _MIME_BY_EXT.get(ext, "application/octet-stream")

    # Sidecar did not record a size — best-effort HEAD of the real file.
    if size_bytes == 0:
        info = _head_object(s3, storage_path)
        if info is not None:
            size_bytes = info[0]

    await conn.execute(_UPDATE_SQL, content_hash, size_bytes, mime_type, storage_path)
    return True


async def sync():
    """Backfill file_size / mime_type / storage_path for every blob whose
    file_size is still 0, reporting progress and a final summary."""
    s3 = get_s3_client()
    pool = await asyncpg.create_pool(
        "postgresql:///tzzr?host=/var/run/postgresql", min_size=2, max_size=10
    )
    async with pool.acquire() as conn:
        blobs = await conn.fetch("""
            SELECT content_hash, storage_path
            FROM storage.physical_blobs
            WHERE file_size = 0
        """)
        print(f"Sincronizando metadata para {len(blobs)} blobs...")

        updated = 0
        errors = 0
        for blob in blobs:
            # NOTE: avoid naming this `hash` — that shadows the builtin.
            content_hash = blob["content_hash"]
            try:
                if await _sync_one(s3, conn, content_hash):
                    updated += 1
                    if updated % 100 == 0:
                        print(f" Actualizados: {updated}")
                else:
                    errors += 1
            except Exception as e:
                errors += 1
                print(f"Error en {content_hash}: {e}")

    print(f"\nSincronización completada:")
    print(f" - Actualizados: {updated}")
    print(f" - Errores: {errors}")

    await pool.close()


if __name__ == "__main__":
    asyncio.run(sync())