Files
captain-claude/apps/storage/sync_metadata.py
ARCHITECT 9b244138b5 Add pending apps and frontend components
- apps/captain-mobile: Mobile API service
- apps/flow-ui: Flow UI application
- apps/mindlink: Mindlink application
- apps/storage: Storage API and workers
- apps/tzzr-cli: TZZR CLI tool
- deck-frontend/backups: Historical TypeScript versions
- hst-frontend: Standalone HST frontend

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 18:26:59 +00:00

131 lines
4.3 KiB
Python

#!/usr/bin/env python3
"""
Sync metadata from the R2 bucket's JSON sidecar files into storage.physical_blobs
"""
import os
import json
import boto3
import asyncio
import asyncpg
# Cloudflare R2 (S3-compatible) endpoint for the account hosting the bucket.
R2_ENDPOINT = "https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com"
# Bucket holding both the binary objects and their "<hash>.json" metadata sidecars.
R2_BUCKET = "deck"
def get_s3_client():
    """Build a boto3 S3 client pointed at the Cloudflare R2 endpoint.

    Credentials come from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
    environment variables (None if unset, per os.environ.get).
    """
    credentials = {
        "aws_access_key_id": os.environ.get("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.environ.get("AWS_SECRET_ACCESS_KEY"),
    }
    return boto3.client("s3", endpoint_url=R2_ENDPOINT, **credentials)
async def sync():
    """Backfill metadata for storage.physical_blobs rows whose file_size is 0.

    For each such blob, fetch its "<content_hash>.json" sidecar from R2 and
    copy size / mime type / storage path into the row. When the sidecar is
    missing, probe the object itself under a few known extensions instead.
    Progress and a final summary are printed to stdout.

    Side effects: reads from R2, updates storage.physical_blobs, prints.
    """
    s3 = get_s3_client()
    pool = await asyncpg.create_pool(
        "postgresql:///tzzr?host=/var/run/postgresql",
        min_size=2, max_size=10
    )
    async with pool.acquire() as conn:
        blobs = await conn.fetch("""
            SELECT content_hash, storage_path
            FROM storage.physical_blobs
            WHERE file_size = 0
        """)
        print(f"Sincronizando metadata para {len(blobs)} blobs...")
        updated = 0
        errors = 0
        for blob in blobs:
            # Renamed from `hash` to avoid shadowing the builtin.
            content_hash = blob["content_hash"]
            json_key = f"{content_hash}.json"
            try:
                obj = s3.get_object(Bucket=R2_BUCKET, Key=json_key)
                meta = json.loads(obj["Body"].read())
                # Extract fields from the metadata sidecar.
                l2 = meta.get("jsonb_standard", {}).get("L2_document", {})
                size_bytes = l2.get("size_bytes", 0)
                mime_type = l2.get("mime_type")
                ext = meta.get("ext", "pdf")
                url_atc = meta.get("url_atc", [])
                storage_path = url_atc[0] if url_atc else f"{content_hash}.{ext}"
                if not mime_type:
                    # Derive the mime type from the extension when absent.
                    mime_type = {
                        "pdf": "application/pdf",
                        "jpg": "image/jpeg",
                        "jpeg": "image/jpeg",
                        "png": "image/png",
                    }.get(ext, "application/octet-stream")
                # Sidecar lacked a size: best-effort probe of the real object.
                if size_bytes == 0:
                    try:
                        head = s3.head_object(Bucket=R2_BUCKET, Key=storage_path)
                        size_bytes = head.get("ContentLength", 0)
                    except Exception:
                        # Deliberately best-effort: keep size 0 if the probe fails.
                        pass
                # Persist the recovered metadata.
                await conn.execute("""
                    UPDATE storage.physical_blobs
                    SET file_size = $2,
                        mime_type = $3,
                        storage_path = $4
                    WHERE content_hash = $1
                """, content_hash, size_bytes, mime_type, storage_path)
                updated += 1
                if updated % 100 == 0:
                    print(f"  Actualizados: {updated}")
            except s3.exceptions.NoSuchKey:
                # No JSON sidecar: probe the object directly under known extensions.
                found = False
                for ext in ("pdf", "png", "jpg"):
                    file_key = f"{content_hash}.{ext}"
                    try:
                        head = s3.head_object(Bucket=R2_BUCKET, Key=file_key)
                    except Exception:
                        continue  # object does not exist under this extension
                    size_bytes = head.get("ContentLength", 0)
                    content_type = head.get("ContentType", "application/octet-stream")
                    try:
                        await conn.execute("""
                            UPDATE storage.physical_blobs
                            SET file_size = $2,
                                mime_type = $3,
                                storage_path = $4
                            WHERE content_hash = $1
                        """, content_hash, size_bytes, content_type, file_key)
                        updated += 1
                        found = True
                    except Exception as e:
                        errors += 1
                        print(f"Error en {content_hash}: {e}")
                    break
                if not found:
                    # Previously this case was silent: no sidecar and no object
                    # under any known extension now counts as an error.
                    errors += 1
            except Exception as e:
                errors += 1
                print(f"Error en {content_hash}: {e}")
        print("\nSincronización completada:")
        print(f"  - Actualizados: {updated}")
        print(f"  - Errores: {errors}")
    await pool.close()
# Script entry point: run the one-shot metadata sync when executed directly.
if __name__ == "__main__":
    asyncio.run(sync())