Files
captain-claude/apps/storage/sync_metadata.py
ARCHITECT 9b244138b5 Add pending apps and frontend components
- apps/captain-mobile: Mobile API service
- apps/flow-ui: Flow UI application
- apps/mindlink: Mindlink application
- apps/storage: Storage API and workers
- apps/tzzr-cli: TZZR CLI tool
- deck-frontend/backups: Historical TypeScript versions
- hst-frontend: Standalone HST frontend

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 18:26:59 +00:00

131 lines
4.3 KiB
Python

#!/usr/bin/env python3
"""
Sync metadata from the R2 bucket's JSON sidecar files into storage.physical_blobs
"""
import os
import json
import boto3
import asyncio
import asyncpg
# Cloudflare R2 (S3-compatible) endpoint for the account hosting the bucket.
R2_ENDPOINT = "https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com"
# Bucket holding both the binary objects and their "<hash>.json" metadata sidecars.
R2_BUCKET = "deck"
def get_s3_client():
    """Build a boto3 S3 client pointed at the Cloudflare R2 endpoint.

    Credentials come from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
    environment variables (None if unset, per os.environ.get).
    """
    credentials = {
        "aws_access_key_id": os.environ.get("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.environ.get("AWS_SECRET_ACCESS_KEY"),
    }
    return boto3.client("s3", endpoint_url=R2_ENDPOINT, **credentials)
async def sync():
    """Backfill metadata for storage.physical_blobs rows whose file_size is 0.

    For each such blob, fetch its "<content_hash>.json" sidecar from R2 and
    copy size / mime type / storage path into the row. When the sidecar is
    missing, probe the object itself under a few known extensions instead.
    Progress and a final summary are printed to stdout.

    Side effects: reads from R2, updates storage.physical_blobs, prints.
    """
    s3 = get_s3_client()
    pool = await asyncpg.create_pool(
        "postgresql:///tzzr?host=/var/run/postgresql",
        min_size=2, max_size=10
    )
    async with pool.acquire() as conn:
        blobs = await conn.fetch("""
            SELECT content_hash, storage_path
            FROM storage.physical_blobs
            WHERE file_size = 0
        """)
        print(f"Sincronizando metadata para {len(blobs)} blobs...")
        updated = 0
        errors = 0
        for blob in blobs:
            # Renamed from `hash` to avoid shadowing the builtin.
            content_hash = blob["content_hash"]
            json_key = f"{content_hash}.json"
            try:
                obj = s3.get_object(Bucket=R2_BUCKET, Key=json_key)
                meta = json.loads(obj["Body"].read())
                # Extract fields from the metadata sidecar.
                l2 = meta.get("jsonb_standard", {}).get("L2_document", {})
                size_bytes = l2.get("size_bytes", 0)
                mime_type = l2.get("mime_type")
                ext = meta.get("ext", "pdf")
                url_atc = meta.get("url_atc", [])
                storage_path = url_atc[0] if url_atc else f"{content_hash}.{ext}"
                if not mime_type:
                    # Derive the mime type from the extension when absent.
                    mime_type = {
                        "pdf": "application/pdf",
                        "jpg": "image/jpeg",
                        "jpeg": "image/jpeg",
                        "png": "image/png",
                    }.get(ext, "application/octet-stream")
                # Sidecar lacked a size: best-effort probe of the real object.
                if size_bytes == 0:
                    try:
                        head = s3.head_object(Bucket=R2_BUCKET, Key=storage_path)
                        size_bytes = head.get("ContentLength", 0)
                    except Exception:
                        # Deliberately best-effort: keep size 0 if the probe fails.
                        pass
                # Persist the recovered metadata.
                await conn.execute("""
                    UPDATE storage.physical_blobs
                    SET file_size = $2,
                        mime_type = $3,
                        storage_path = $4
                    WHERE content_hash = $1
                """, content_hash, size_bytes, mime_type, storage_path)
                updated += 1
                if updated % 100 == 0:
                    print(f"  Actualizados: {updated}")
            except s3.exceptions.NoSuchKey:
                # No JSON sidecar: probe the object directly under known extensions.
                found = False
                for ext in ("pdf", "png", "jpg"):
                    file_key = f"{content_hash}.{ext}"
                    try:
                        head = s3.head_object(Bucket=R2_BUCKET, Key=file_key)
                    except Exception:
                        continue  # object does not exist under this extension
                    size_bytes = head.get("ContentLength", 0)
                    content_type = head.get("ContentType", "application/octet-stream")
                    try:
                        await conn.execute("""
                            UPDATE storage.physical_blobs
                            SET file_size = $2,
                                mime_type = $3,
                                storage_path = $4
                            WHERE content_hash = $1
                        """, content_hash, size_bytes, content_type, file_key)
                        updated += 1
                        found = True
                    except Exception as e:
                        errors += 1
                        print(f"Error en {content_hash}: {e}")
                    break
                if not found:
                    # Previously this case was silent: no sidecar and no object
                    # under any known extension now counts as an error.
                    errors += 1
            except Exception as e:
                errors += 1
                print(f"Error en {content_hash}: {e}")
        print("\nSincronización completada:")
        print(f"  - Actualizados: {updated}")
        print(f"  - Errores: {errors}")
    await pool.close()
# Script entry point: run the one-shot metadata sync when executed directly.
if __name__ == "__main__":
    asyncio.run(sync())