Add pending apps and frontend components

- apps/captain-mobile: Mobile API service
- apps/flow-ui: Flow UI application
- apps/mindlink: Mindlink application
- apps/storage: Storage API and workers
- apps/tzzr-cli: TZZR CLI tool
- deck-frontend/backups: Historical TypeScript versions
- hst-frontend: Standalone HST frontend

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
ARCHITECT committed 2026-01-16 18:26:59 +00:00
commit 9b244138b5 (parent 17506aaee2)
177 changed files with 15063 additions and 0 deletions

apps/storage/migrate_atc.py

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
Migration: import existing files from secretaria_clara.atc into storage
"""
import asyncio
import asyncpg
import os
import json
DB_URL = os.environ.get("DATABASE_URL", "postgresql://tzzr:tzzr@localhost:5432/tzzr")
async def migrate():
pool = await asyncpg.create_pool(DB_URL, min_size=2, max_size=10)
async with pool.acquire() as conn:
# Fetch atc rows that have a hash and a url_file
atc_files = await conn.fetch("""
SELECT
mrf,
private_mrf,
alias,
name_es,
ref,
ext,
jsonb_standard,
hashtags
FROM secretaria_clara.atc
WHERE jsonb_standard IS NOT NULL
AND jsonb_standard->'L2_document'->>'url_file' IS NOT NULL
""")
print(f"Encontrados {len(atc_files)} archivos en atc")
migrated = 0
skipped = 0
errors = 0
for file in atc_files:
try:
mrf = file["mrf"]
jsonb = file["jsonb_standard"] or {}
# asyncpg returns jsonb columns as text by default; parse if we got a string
if isinstance(jsonb, str):
    jsonb = json.loads(jsonb)
# Extract the data
l2 = jsonb.get("L2_document", {})
url_file = l2.get("url_file")
size_bytes = l2.get("size_bytes", 0)
mime_type = l2.get("mime_type", "application/octet-stream")
if not url_file:
skipped += 1
continue
# Check whether it already exists in storage
existing = await conn.fetchrow("""
SELECT content_hash FROM storage.physical_blobs
WHERE content_hash = $1
""", mrf)
if existing:
skipped += 1
continue
# Insert into physical_blobs
await conn.execute("""
INSERT INTO storage.physical_blobs
(content_hash, file_size, mime_type, storage_provider, storage_path, verification_status)
VALUES ($1, $2, $3, 'R2_PRIMARY', $4, 'VERIFIED')
""", mrf, size_bytes, mime_type, url_file)
# Create a user_asset whose public_key equals the mrf
# Use a dummy UUID for user_id since there are no users yet
await conn.execute("""
INSERT INTO storage.user_assets
(public_key, blob_hash, user_id, original_filename)
VALUES ($1, $2, '00000000-0000-0000-0000-000000000000'::uuid, $3)
ON CONFLICT (public_key) DO NOTHING
""", mrf, mrf, file["name_es"] or file["alias"] or mrf[:20])
migrated += 1
if migrated % 100 == 0:
print(f" Migrados: {migrated}")
except Exception as e:
errors += 1
print(f"Error migrando {file['mrf']}: {e}")
print(f"\nMigración completada:")
print(f" - Migrados: {migrated}")
print(f" - Saltados (ya existían o sin datos): {skipped}")
print(f" - Errores: {errors}")
# Update ref_count
await conn.execute("""
UPDATE storage.physical_blobs pb
SET ref_count = (
SELECT COUNT(*) FROM storage.user_assets ua
WHERE ua.blob_hash = pb.content_hash
)
""")
print(" - ref_count actualizado")
await pool.close()
if __name__ == "__main__":
asyncio.run(migrate())


@@ -0,0 +1,9 @@
fastapi>=0.104.0
uvicorn>=0.24.0
asyncpg>=0.29.0
boto3>=1.34.0
Pillow>=10.0.0
PyMuPDF>=1.23.0
argon2-cffi>=23.1.0
python-multipart>=0.0.6
pydantic>=2.5.0


@@ -0,0 +1,20 @@
[Unit]
Description=Storage API Server
After=network.target postgresql.service
[Service]
Type=simple
User=root
WorkingDirectory=/opt/storage
ExecStart=/opt/storage/venv/bin/python storage_api.py
Restart=always
RestartSec=5
Environment=R2_ENDPOINT=https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com
Environment=R2_BUCKET=deck
Environment=DATABASE_URL=postgresql://tzzr:tzzr@localhost:5432/tzzr
Environment=AWS_ACCESS_KEY_ID=
Environment=AWS_SECRET_ACCESS_KEY=
[Install]
WantedBy=multi-user.target

apps/storage/storage_api.py

@@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""
Storage API - endpoints for file upload/download
Spec: Hybrid Storage System v4.0
"""
import os
import hashlib
import json
import asyncio
from datetime import datetime, timedelta
from typing import Optional
import asyncpg
import boto3
from fastapi import FastAPI, HTTPException, Request, Header, Query, BackgroundTasks
from fastapi.responses import RedirectResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn
from collections import defaultdict
import time
import argon2
# Configuration
R2_ENDPOINT = os.environ.get("R2_ENDPOINT", "https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com")
R2_BUCKET = os.environ.get("R2_BUCKET", "deck")
DB_URL = os.environ.get("DATABASE_URL", "postgresql://tzzr:tzzr@localhost:5432/tzzr")
PRESIGNED_UPLOAD_EXPIRY = 3 * 60 * 60 # 3 hours
PRESIGNED_DOWNLOAD_EXPIRY = 45 * 60 # 45 minutes
# Rate limiting
RATE_LIMIT_IP = 100 # requests/min per IP
RATE_LIMIT_KEY = 50 # downloads/hour per public_key
RATE_LIMIT_TRANSFER = 10 * 1024 * 1024 * 1024 # 10 GB/hour
app = FastAPI(title="Storage API", version="4.0")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_methods=["*"],
allow_headers=["*"],
)
# Global state
db_pool = None
s3_client = None
rate_limits = {
"ip": defaultdict(list), # IP -> [timestamps]
"key": defaultdict(list), # public_key -> [timestamps]
"transfer": defaultdict(int) # IP -> bytes
}
ph = argon2.PasswordHasher()
# =========================================================================
# STARTUP / SHUTDOWN
# =========================================================================
@app.on_event("startup")
async def startup():
global db_pool, s3_client
db_pool = await asyncpg.create_pool(DB_URL, min_size=2, max_size=20)
s3_client = boto3.client(
"s3",
endpoint_url=R2_ENDPOINT,
aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)
@app.on_event("shutdown")
async def shutdown():
if db_pool:
await db_pool.close()
# =========================================================================
# RATE LIMITING
# =========================================================================
def check_rate_limit_ip(ip: str) -> bool:
"""100 req/min por IP"""
now = time.time()
minute_ago = now - 60
# Drop stale timestamps
rate_limits["ip"][ip] = [t for t in rate_limits["ip"][ip] if t > minute_ago]
if len(rate_limits["ip"][ip]) >= RATE_LIMIT_IP:
return False
rate_limits["ip"][ip].append(now)
return True
def check_rate_limit_key(public_key: str) -> bool:
"""50 descargas/hora por public_key"""
now = time.time()
hour_ago = now - 3600
rate_limits["key"][public_key] = [t for t in rate_limits["key"][public_key] if t > hour_ago]
if len(rate_limits["key"][public_key]) >= RATE_LIMIT_KEY:
return False
rate_limits["key"][public_key].append(now)
return True
# =========================================================================
# MODELS
# =========================================================================
class UploadInitRequest(BaseModel):
hash: str
size: int
mime_type: str
filename: str
user_id: str
password: Optional[str] = None
class UploadInitResponse(BaseModel):
status: str
presigned_url: Optional[str] = None
deduplicated: bool = False
public_key: Optional[str] = None
# =========================================================================
# UPLOAD ENDPOINTS
# =========================================================================
@app.post("/upload/init", response_model=UploadInitResponse)
async def upload_init(req: UploadInitRequest, request: Request, background_tasks: BackgroundTasks):
"""
Start an upload. Returns a presigned URL or confirms deduplication.
"""
client_ip = request.client.host
if not check_rate_limit_ip(client_ip):
raise HTTPException(429, "Rate limit exceeded")
async with db_pool.acquire() as conn:
# Check whether the blob already exists
blob = await conn.fetchrow("""
SELECT content_hash, verification_status
FROM storage.physical_blobs
WHERE content_hash = $1
""", req.hash)
if blob:
if blob["verification_status"] == "VERIFIED":
# Deduplication: create the asset without uploading
public_key = hashlib.sha256(
f"{req.hash}{req.user_id}{datetime.now().isoformat()}".encode()
).hexdigest()
password_hash = None
if req.password:
password_hash = ph.hash(req.password)
await conn.execute("""
INSERT INTO storage.user_assets
(public_key, blob_hash, user_id, original_filename, access_password)
VALUES ($1, $2, $3, $4, $5)
""", public_key, req.hash, req.user_id, req.filename, password_hash)
return UploadInitResponse(
status="created",
deduplicated=True,
public_key=public_key
)
# Blob exists but is PENDING - the client must upload anyway
else:
# Create a PENDING record
storage_path = f"{req.hash}.bin"
await conn.execute("""
INSERT INTO storage.physical_blobs
(content_hash, file_size, mime_type, storage_provider, storage_path)
VALUES ($1, $2, $3, 'R2_PRIMARY', $4)
""", req.hash, req.size, req.mime_type, storage_path)
# Generate a presigned URL for the upload
storage_path = f"{req.hash}.bin"
presigned_url = s3_client.generate_presigned_url(
"put_object",
Params={
"Bucket": R2_BUCKET,
"Key": storage_path,
"ContentType": req.mime_type
},
ExpiresIn=PRESIGNED_UPLOAD_EXPIRY
)
return UploadInitResponse(
status="upload_required",
presigned_url=presigned_url,
deduplicated=False
)
@app.post("/upload/complete/{content_hash}")
async def upload_complete(
content_hash: str,
user_id: str = Query(...),
filename: str = Query(...),
password: Optional[str] = Query(None),
background_tasks: BackgroundTasks = None
):
"""
Confirm that the upload finished. Queues verification.
"""
async with db_pool.acquire() as conn:
blob = await conn.fetchrow("""
SELECT content_hash, storage_path
FROM storage.physical_blobs
WHERE content_hash = $1
""", content_hash)
if not blob:
raise HTTPException(404, "Blob not found")
# Queue verification in the background
# In production this would go to a queue (Redis, RabbitMQ, etc.)
background_tasks.add_task(
verify_and_finalize,
content_hash,
blob["storage_path"],
user_id,
filename,
password
)
return {"status": "processing", "content_hash": content_hash}
async def verify_and_finalize(
content_hash: str,
storage_path: str,
user_id: str,
filename: str,
password: Optional[str]
):
"""Background task para verificar y finalizar upload"""
from storage_worker import StorageWorker
worker = StorageWorker()
await worker.init()
try:
result = await worker.process_upload(
content_hash,
storage_path,
user_id,
filename,
ph.hash(password) if password else None
)
# In production: notify the client via webhook/websocket
print(f"Upload finalized: {result}")
finally:
await worker.close()
# =========================================================================
# DOWNLOAD ENDPOINTS
# =========================================================================
@app.get("/file/{public_key}")
async def download_file(
public_key: str,
request: Request,
password: Optional[str] = Query(None)
):
"""
File download. Returns a redirect to a signed URL.
"""
client_ip = request.client.host
# Rate limiting
if not check_rate_limit_ip(client_ip):
raise HTTPException(429, "Rate limit exceeded - IP")
if not check_rate_limit_key(public_key):
raise HTTPException(429, "Rate limit exceeded - downloads")
async with db_pool.acquire() as conn:
# Look up the asset
asset = await conn.fetchrow("""
SELECT a.id, a.blob_hash, a.original_filename, a.access_password, a.downloads_count,
b.storage_provider, b.storage_path, b.verification_status, b.mime_type
FROM storage.user_assets a
JOIN storage.physical_blobs b ON a.blob_hash = b.content_hash
WHERE a.public_key = $1
""", public_key)
if not asset:
raise HTTPException(404, "Asset not found")
# Check the password if one is required
if asset["access_password"]:
if not password:
raise HTTPException(401, "Password required")
try:
ph.verify(asset["access_password"], password)
except:
raise HTTPException(401, "Invalid password")
# Check the blob's status
status = asset["verification_status"]
if status == "PENDING":
raise HTTPException(202, "File is being processed")
if status in ("CORRUPT", "LOST"):
raise HTTPException(410, "File is no longer available")
# Increment the download counter
await conn.execute("""
UPDATE storage.user_assets
SET downloads_count = downloads_count + 1
WHERE id = $1
""", asset["id"])
# Generate a signed URL for the storage provider
provider = asset["storage_provider"]
if provider in ("R2_PRIMARY", "R2_CACHE"):
presigned_url = s3_client.generate_presigned_url(
"get_object",
Params={
"Bucket": R2_BUCKET,
"Key": asset["storage_path"],
"ResponseContentDisposition": f'attachment; filename="{asset["original_filename"]}"',
"ResponseContentType": asset["mime_type"]
},
ExpiresIn=PRESIGNED_DOWNLOAD_EXPIRY
)
return RedirectResponse(presigned_url, status_code=302)
elif provider == "SHAREPOINT":
# TODO: implement SharePoint access via the Graph API
raise HTTPException(503, "SharePoint access not implemented")
else:
raise HTTPException(503, "Unknown storage provider")
@app.get("/file/{public_key}/info")
async def file_info(public_key: str, request: Request):
"""
File information without downloading it.
"""
client_ip = request.client.host
if not check_rate_limit_ip(client_ip):
raise HTTPException(429, "Rate limit exceeded")
async with db_pool.acquire() as conn:
asset = await conn.fetchrow("""
SELECT a.public_key, a.original_filename, a.downloads_count, a.created_at,
b.file_size, b.mime_type, b.verification_status,
(a.access_password IS NOT NULL) as password_protected
FROM storage.user_assets a
JOIN storage.physical_blobs b ON a.blob_hash = b.content_hash
WHERE a.public_key = $1
""", public_key)
if not asset:
raise HTTPException(404, "Asset not found")
return {
"public_key": asset["public_key"],
"filename": asset["original_filename"],
"size": asset["file_size"],
"mime_type": asset["mime_type"],
"status": asset["verification_status"],
"downloads": asset["downloads_count"],
"password_protected": asset["password_protected"],
"created_at": asset["created_at"].isoformat()
}
@app.get("/file/{public_key}/thumb")
async def file_thumbnail(public_key: str, request: Request):
"""
Redirect to the file's thumbnail.
"""
client_ip = request.client.host
if not check_rate_limit_ip(client_ip):
raise HTTPException(429, "Rate limit exceeded")
async with db_pool.acquire() as conn:
asset = await conn.fetchrow("""
SELECT a.blob_hash, b.verification_status
FROM storage.user_assets a
JOIN storage.physical_blobs b ON a.blob_hash = b.content_hash
WHERE a.public_key = $1
""", public_key)
if not asset:
raise HTTPException(404, "Asset not found")
if asset["verification_status"] != "VERIFIED":
raise HTTPException(202, "Thumbnail not ready")
# URL of the thumbnail
thumb_key = f"{asset['blob_hash']}.thumb"
try:
# Check that it exists
s3_client.head_object(Bucket=R2_BUCKET, Key=thumb_key)
except:
raise HTTPException(404, "Thumbnail not available")
presigned_url = s3_client.generate_presigned_url(
"get_object",
Params={"Bucket": R2_BUCKET, "Key": thumb_key},
ExpiresIn=PRESIGNED_DOWNLOAD_EXPIRY
)
return RedirectResponse(presigned_url, status_code=302)
# =========================================================================
# HEALTH
# =========================================================================
@app.get("/health")
async def health():
return {"status": "ok", "timestamp": datetime.now().isoformat()}
# =========================================================================
# MAIN
# =========================================================================
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8080)
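
For orientation, here is a minimal client-side sketch of the upload flow these endpoints expect: hash the file, call /upload/init, PUT directly to the presigned URL unless the response reports deduplication, then confirm via /upload/complete. The base URL and the use of the requests library are illustration-only assumptions; this sketch is not part of the commit.

# Hypothetical client sketch for the upload flow above -- not part of this commit.
# Assumes the API is reachable at http://localhost:8080 and that `requests` is installed.
import hashlib
import mimetypes
import os
import requests

API_BASE = "http://localhost:8080"  # assumed deployment URL

def upload_file(path: str, user_id: str) -> dict:
    data = open(path, "rb").read()
    content_hash = hashlib.sha256(data).hexdigest()
    mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
    # 1. Declare the upload; the server answers with a presigned URL or a dedup hit.
    init = requests.post(f"{API_BASE}/upload/init", json={
        "hash": content_hash,
        "size": len(data),
        "mime_type": mime,
        "filename": os.path.basename(path),
        "user_id": user_id,
    }).json()
    if init.get("deduplicated"):
        # Blob already verified server-side; the asset was created without a transfer.
        return init
    # 2. Upload the bytes straight to R2 via the presigned URL (Content-Type must match).
    requests.put(init["presigned_url"], data=data, headers={"Content-Type": mime})
    # 3. Confirm; the server queues hash verification and derivative generation.
    return requests.post(
        f"{API_BASE}/upload/complete/{content_hash}",
        params={"user_id": user_id, "filename": os.path.basename(path)},
    ).json()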


@@ -0,0 +1,480 @@
#!/usr/bin/env python3
"""
Storage Worker - file verification and processing
Spec: Hybrid Storage System v4.0
"""
import os
import hashlib
import json
import asyncio
import asyncpg
from datetime import datetime
from typing import Optional, Dict, Any
import boto3
from PIL import Image
import fitz # PyMuPDF
import io
import tempfile
# Configuration
R2_ENDPOINT = os.environ.get("R2_ENDPOINT", "https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com")
R2_BUCKET = os.environ.get("R2_BUCKET", "deck")
DB_URL = os.environ.get("DATABASE_URL", "postgresql://tzzr:tzzr@localhost:5432/tzzr")
THUMB_WIDTH = 300
MAX_RETRIES = 9
RETRY_BACKOFF_BASE = 2
def get_s3_client():
return boto3.client(
"s3",
endpoint_url=R2_ENDPOINT,
aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)
async def get_db_pool():
return await asyncpg.create_pool(DB_URL, min_size=2, max_size=10)
def calculate_sha256(data: bytes) -> str:
"""Calcula SHA-256 de bytes"""
return hashlib.sha256(data).hexdigest()
def generate_public_key(content_hash: str, user_id: str) -> str:
"""Genera public_key única para un asset"""
data = f"{content_hash}{user_id}{datetime.now().isoformat()}"
return hashlib.sha256(data.encode()).hexdigest()
class StorageWorker:
def __init__(self):
self.s3 = get_s3_client()
self.pool = None
async def init(self):
self.pool = await get_db_pool()
async def close(self):
if self.pool:
await self.pool.close()
# =========================================================================
# HASH VERIFICATION
# =========================================================================
async def verify_blob(self, declared_hash: str, storage_path: str) -> Dict[str, Any]:
"""
Verify that the declared hash matches the actual content.
NEVER trust the client-supplied hash.
"""
try:
# Download the file
obj = self.s3.get_object(Bucket=R2_BUCKET, Key=storage_path)
content = obj["Body"].read()
# Compute the real hash
calculated_hash = calculate_sha256(content)
if calculated_hash != declared_hash:
# HASH MISMATCH - corrupt file or spoofing
await self._mark_corrupt(declared_hash, storage_path)
return {
"status": "CORRUPT",
"declared": declared_hash,
"calculated": calculated_hash,
"action": "deleted"
}
# Hash matches - mark as verified
await self._mark_verified(declared_hash)
return {
"status": "VERIFIED",
"hash": declared_hash,
"size": len(content)
}
except Exception as e:
return {"status": "ERROR", "error": str(e)}
async def _mark_corrupt(self, content_hash: str, storage_path: str):
"""Marca blob como corrupto y elimina archivo"""
async with self.pool.acquire() as conn:
await conn.execute("""
UPDATE storage.physical_blobs
SET verification_status = 'CORRUPT', updated_at = NOW()
WHERE content_hash = $1
""", content_hash)
# Delete the file from the bucket
try:
self.s3.delete_object(Bucket=R2_BUCKET, Key=storage_path)
except:
pass
async def _mark_verified(self, content_hash: str):
"""Marca blob como verificado"""
async with self.pool.acquire() as conn:
await conn.execute("""
UPDATE storage.physical_blobs
SET verification_status = 'VERIFIED',
last_verified_at = NOW(),
updated_at = NOW()
WHERE content_hash = $1
""", content_hash)
# =========================================================================
# DERIVATIVE GENERATION
# =========================================================================
async def generate_derivatives(self, content_hash: str) -> Dict[str, Any]:
"""Genera thumbnail y metadatos para un blob verificado"""
async with self.pool.acquire() as conn:
blob = await conn.fetchrow("""
SELECT content_hash, mime_type, storage_path, file_size
FROM storage.physical_blobs
WHERE content_hash = $1 AND verification_status = 'VERIFIED'
""", content_hash)
if not blob:
return {"status": "ERROR", "error": "Blob not found or not verified"}
mime_type = blob["mime_type"]
storage_path = blob["storage_path"]
# Download the file
obj = self.s3.get_object(Bucket=R2_BUCKET, Key=storage_path)
content = obj["Body"].read()
metadata = {
"content_hash": content_hash,
"mime_type": mime_type,
"file_size": blob["file_size"],
"processed_at": datetime.now().isoformat()
}
thumb_generated = False
# Generate a thumbnail based on the file type
if mime_type.startswith("image/"):
thumb_data, extra_meta = self._process_image(content)
metadata.update(extra_meta)
if thumb_data:
await self._save_thumb(content_hash, thumb_data)
thumb_generated = True
elif mime_type == "application/pdf":
thumb_data, extra_meta = self._process_pdf(content)
metadata.update(extra_meta)
if thumb_data:
await self._save_thumb(content_hash, thumb_data)
thumb_generated = True
# Save the metadata
await self._save_metadata(content_hash, metadata)
return {
"status": "OK",
"thumb_generated": thumb_generated,
"metadata": metadata
}
def _process_image(self, content: bytes) -> tuple:
"""Procesa imagen: genera thumb y extrae metadatos"""
try:
img = Image.open(io.BytesIO(content))
# Metadata
meta = {
"width": img.width,
"height": img.height,
"format": img.format,
"mode": img.mode
}
# EXIF, if available
if hasattr(img, '_getexif') and img._getexif():
meta["has_exif"] = True
# Generate the thumbnail
ratio = THUMB_WIDTH / img.width
new_height = int(img.height * ratio)
thumb = img.copy()
thumb.thumbnail((THUMB_WIDTH, new_height), Image.Resampling.LANCZOS)
# Convert to bytes
thumb_buffer = io.BytesIO()
thumb.save(thumb_buffer, format="JPEG", quality=85)
thumb_data = thumb_buffer.getvalue()
return thumb_data, meta
except Exception as e:
return None, {"error": str(e)}
def _process_pdf(self, content: bytes) -> tuple:
"""Procesa PDF: genera thumb de primera página y extrae metadatos"""
try:
doc = fitz.open(stream=content, filetype="pdf")
meta = {
"pages": len(doc),
"format": "PDF"
}
# Document metadata
pdf_meta = doc.metadata
if pdf_meta:
if pdf_meta.get("author"):
meta["author"] = pdf_meta["author"]
if pdf_meta.get("title"):
meta["title"] = pdf_meta["title"]
# Render the first page as a thumbnail
if len(doc) > 0:
page = doc[0]
# Scale so the width equals THUMB_WIDTH
zoom = THUMB_WIDTH / page.rect.width
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)
thumb_data = pix.tobytes("jpeg")
else:
thumb_data = None
doc.close()
return thumb_data, meta
except Exception as e:
return None, {"error": str(e)}
async def _save_thumb(self, content_hash: str, thumb_data: bytes):
"""Guarda thumbnail en el bucket"""
key = f"{content_hash}.thumb"
self.s3.put_object(
Bucket=R2_BUCKET,
Key=key,
Body=thumb_data,
ContentType="image/jpeg"
)
async def _save_metadata(self, content_hash: str, metadata: dict):
"""Guarda metadatos JSON en el bucket"""
key = f"{content_hash}.json"
self.s3.put_object(
Bucket=R2_BUCKET,
Key=key,
Body=json.dumps(metadata, indent=2),
ContentType="application/json"
)
# =========================================================================
# FULL PROCESSING
# =========================================================================
async def process_upload(
self,
declared_hash: str,
storage_path: str,
user_id: str,
original_filename: str,
access_password: Optional[str] = None
) -> Dict[str, Any]:
"""
Full post-upload process:
1. Verify the hash
2. Generate derivatives
3. Create the user_asset
"""
# 1. Verify the hash
verify_result = await self.verify_blob(declared_hash, storage_path)
if verify_result["status"] != "VERIFIED":
return verify_result
# 2. Generate derivatives (with retries)
for attempt in range(MAX_RETRIES):
try:
deriv_result = await self.generate_derivatives(declared_hash)
if deriv_result["status"] == "OK":
break
except Exception as e:
if attempt == MAX_RETRIES - 1:
# Last attempt failed, but the blob is already verified
deriv_result = {"status": "PARTIAL", "error": str(e)}
else:
await asyncio.sleep(RETRY_BACKOFF_BASE ** attempt)
# 3. Create the user_asset
public_key = generate_public_key(declared_hash, user_id)
async with self.pool.acquire() as conn:
await conn.execute("""
INSERT INTO storage.user_assets
(public_key, blob_hash, user_id, original_filename, access_password)
VALUES ($1, $2, $3, $4, $5)
""", public_key, declared_hash, user_id, original_filename, access_password)
return {
"status": "CREATED",
"public_key": public_key,
"content_hash": declared_hash,
"derivatives": deriv_result
}
# =========================================================================
# BLOB REGISTRATION (no upload - for existing files)
# =========================================================================
async def register_blob(
self,
content_hash: str,
file_size: int,
mime_type: str,
storage_provider: str,
storage_path: str
) -> Dict[str, Any]:
"""Registra un blob existente en el sistema"""
async with self.pool.acquire() as conn:
# Check whether it already exists
existing = await conn.fetchrow("""
SELECT content_hash, verification_status
FROM storage.physical_blobs
WHERE content_hash = $1
""", content_hash)
if existing:
return {
"status": "EXISTS",
"content_hash": content_hash,
"verification_status": existing["verification_status"]
}
# Insert the new blob
await conn.execute("""
INSERT INTO storage.physical_blobs
(content_hash, file_size, mime_type, storage_provider, storage_path)
VALUES ($1, $2, $3, $4::storage.storage_provider_enum, $5)
""", content_hash, file_size, mime_type, storage_provider, storage_path)
return {
"status": "REGISTERED",
"content_hash": content_hash,
"verification_status": "PENDING"
}
# =========================================================================
# MAINTENANCE
# =========================================================================
async def garbage_collect(self, dry_run: bool = True) -> Dict[str, Any]:
"""
Delete orphaned blobs (ref_count = 0, not updated in 30 days)
"""
async with self.pool.acquire() as conn:
orphans = await conn.fetch("""
SELECT content_hash, storage_path
FROM storage.physical_blobs
WHERE ref_count = 0
AND updated_at < NOW() - INTERVAL '30 days'
""")
deleted = []
for blob in orphans:
if not dry_run:
# Delete derivatives
for ext in [".thumb", ".json"]:
try:
self.s3.delete_object(Bucket=R2_BUCKET, Key=f"{blob['content_hash']}{ext}")
except:
pass
# Delete the blob
try:
self.s3.delete_object(Bucket=R2_BUCKET, Key=blob["storage_path"])
except:
pass
# Delete the database record
async with self.pool.acquire() as conn:
await conn.execute("""
DELETE FROM storage.physical_blobs WHERE content_hash = $1
""", blob["content_hash"])
deleted.append(blob["content_hash"])
return {
"status": "OK",
"dry_run": dry_run,
"orphans_found": len(orphans),
"deleted": deleted if not dry_run else []
}
async def integrity_check(self, sample_percent: float = 0.01) -> Dict[str, Any]:
"""
Verify the integrity of a random sample of blobs
"""
async with self.pool.acquire() as conn:
blobs = await conn.fetch("""
SELECT content_hash, storage_path
FROM storage.physical_blobs
WHERE verification_status = 'VERIFIED'
ORDER BY RANDOM()
LIMIT (SELECT CEIL(COUNT(*) * $1) FROM storage.physical_blobs WHERE verification_status = 'VERIFIED')
""", sample_percent)
results = {"checked": 0, "ok": 0, "corrupt": []}
for blob in blobs:
results["checked"] += 1
verify = await self.verify_blob(blob["content_hash"], blob["storage_path"])
if verify["status"] == "VERIFIED":
results["ok"] += 1
else:
results["corrupt"].append(blob["content_hash"])
return results
# CLI for testing
async def main():
import sys
worker = StorageWorker()
await worker.init()
if len(sys.argv) < 2:
print("Usage: storage_worker.py <command> [args]")
print("Commands: gc, integrity, register")
return
cmd = sys.argv[1]
if cmd == "gc":
dry_run = "--execute" not in sys.argv
result = await worker.garbage_collect(dry_run=dry_run)
print(json.dumps(result, indent=2))
elif cmd == "integrity":
result = await worker.integrity_check()
print(json.dumps(result, indent=2))
elif cmd == "register":
if len(sys.argv) < 6:
print("Usage: storage_worker.py register <hash> <size> <mime> <path>")
return
result = await worker.register_blob(
sys.argv[2], int(sys.argv[3]), sys.argv[4], "R2_PRIMARY", sys.argv[5]
)
print(json.dumps(result, indent=2))
await worker.close()
if __name__ == "__main__":
asyncio.run(main())
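
As a usage note, the worker's maintenance entry points could be wired into a periodic job roughly as sketched below; the nightly schedule and the dry-run-first policy are assumptions, not something this commit ships.

# Hypothetical periodic maintenance runner built on the worker above -- not part of this commit.
import asyncio
import json

from storage_worker import StorageWorker

async def nightly_maintenance():
    worker = StorageWorker()
    await worker.init()
    try:
        # Report orphaned blobs first; pass dry_run=False only once the report looks sane.
        gc_report = await worker.garbage_collect(dry_run=True)
        print(json.dumps(gc_report, indent=2))
        # Re-hash a 1% random sample of VERIFIED blobs against their stored content hashes.
        integrity = await worker.integrity_check(sample_percent=0.01)
        print(json.dumps(integrity, indent=2))
    finally:
        await worker.close()

if __name__ == "__main__":
    asyncio.run(nightly_maintenance())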


@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Sync metadata from the JSON files in the R2 bucket into storage.physical_blobs
"""
import os
import json
import boto3
import asyncio
import asyncpg
R2_ENDPOINT = "https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com"
R2_BUCKET = "deck"
def get_s3_client():
return boto3.client(
"s3",
endpoint_url=R2_ENDPOINT,
aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)
async def sync():
s3 = get_s3_client()
pool = await asyncpg.create_pool(
"postgresql:///tzzr?host=/var/run/postgresql",
min_size=2, max_size=10
)
async with pool.acquire() as conn:
blobs = await conn.fetch("""
SELECT content_hash, storage_path
FROM storage.physical_blobs
WHERE file_size = 0
""")
print(f"Sincronizando metadata para {len(blobs)} blobs...")
updated = 0
errors = 0
for blob in blobs:
hash = blob["content_hash"]
json_key = f"{hash}.json"
try:
obj = s3.get_object(Bucket=R2_BUCKET, Key=json_key)
meta = json.loads(obj["Body"].read())
# Extract the data
l2 = meta.get("jsonb_standard", {}).get("L2_document", {})
size_bytes = l2.get("size_bytes", 0)
mime_type = l2.get("mime_type")
ext = meta.get("ext", "pdf")
url_atc = meta.get("url_atc", [])
storage_path = url_atc[0] if url_atc else f"{hash}.{ext}"
if not mime_type:
if ext == "pdf":
mime_type = "application/pdf"
elif ext in ("jpg", "jpeg"):
mime_type = "image/jpeg"
elif ext == "png":
mime_type = "image/png"
else:
mime_type = "application/octet-stream"
# Get the real file size if it is not in the JSON
if size_bytes == 0:
try:
file_obj = s3.head_object(Bucket=R2_BUCKET, Key=storage_path)
size_bytes = file_obj.get("ContentLength", 0)
except:
pass
# Update the record
await conn.execute("""
UPDATE storage.physical_blobs
SET file_size = $2,
mime_type = $3,
storage_path = $4
WHERE content_hash = $1
""", hash, size_bytes, mime_type, storage_path)
updated += 1
if updated % 100 == 0:
print(f" Actualizados: {updated}")
except s3.exceptions.NoSuchKey:
# The JSON does not exist; try to get the size from the file directly
try:
# Try different extensions
for ext in ["pdf", "png", "jpg"]:
try:
file_key = f"{hash}.{ext}"
file_obj = s3.head_object(Bucket=R2_BUCKET, Key=file_key)
size_bytes = file_obj.get("ContentLength", 0)
content_type = file_obj.get("ContentType", "application/octet-stream")
await conn.execute("""
UPDATE storage.physical_blobs
SET file_size = $2,
mime_type = $3,
storage_path = $4
WHERE content_hash = $1
""", hash, size_bytes, content_type, file_key)
updated += 1
break
except:
continue
except Exception as e:
errors += 1
except Exception as e:
errors += 1
print(f"Error en {hash}: {e}")
print(f"\nSincronización completada:")
print(f" - Actualizados: {updated}")
print(f" - Errores: {errors}")
await pool.close()
if __name__ == "__main__":
asyncio.run(sync())