Add pending apps and frontend components
- apps/captain-mobile: Mobile API service
- apps/flow-ui: Flow UI application
- apps/mindlink: Mindlink application
- apps/storage: Storage API and workers
- apps/tzzr-cli: TZZR CLI tool
- deck-frontend/backups: Historical TypeScript versions
- hst-frontend: Standalone HST frontend

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
apps/storage/migrate_atc.py (new file, 109 lines)
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
Migration: import existing files from secretaria_clara.atc into storage
"""

import asyncio
import asyncpg
import os
import json

DB_URL = os.environ.get("DATABASE_URL", "postgresql://tzzr:tzzr@localhost:5432/tzzr")


async def migrate():
    pool = await asyncpg.create_pool(DB_URL, min_size=2, max_size=10)

    async with pool.acquire() as conn:
        # Fetch atc rows that have a hash and a url_file
        atc_files = await conn.fetch("""
            SELECT
                mrf,
                private_mrf,
                alias,
                name_es,
                ref,
                ext,
                jsonb_standard,
                hashtags
            FROM secretaria_clara.atc
            WHERE jsonb_standard IS NOT NULL
              AND jsonb_standard->'L2_document'->>'url_file' IS NOT NULL
        """)

        print(f"Found {len(atc_files)} files in atc")

        migrated = 0
        skipped = 0
        errors = 0

        for file in atc_files:
            try:
                mrf = file["mrf"]
                jsonb = file["jsonb_standard"] or {}
                # asyncpg returns jsonb columns as text by default; decode if needed
                if isinstance(jsonb, str):
                    jsonb = json.loads(jsonb)

                # Extract data
                l2 = jsonb.get("L2_document", {})
                url_file = l2.get("url_file")
                size_bytes = l2.get("size_bytes", 0)
                mime_type = l2.get("mime_type", "application/octet-stream")

                if not url_file:
                    skipped += 1
                    continue

                # Check whether it already exists in storage
                existing = await conn.fetchrow("""
                    SELECT content_hash FROM storage.physical_blobs
                    WHERE content_hash = $1
                """, mrf)

                if existing:
                    skipped += 1
                    continue

                # Insert into physical_blobs
                await conn.execute("""
                    INSERT INTO storage.physical_blobs
                    (content_hash, file_size, mime_type, storage_provider, storage_path, verification_status)
                    VALUES ($1, $2, $3, 'R2_PRIMARY', $4, 'VERIFIED')
                """, mrf, size_bytes, mime_type, url_file)

                # Create a user_asset whose public_key is the mrf itself
                # Use a dummy UUID for user_id since there are no users yet
                await conn.execute("""
                    INSERT INTO storage.user_assets
                    (public_key, blob_hash, user_id, original_filename)
                    VALUES ($1, $2, '00000000-0000-0000-0000-000000000000'::uuid, $3)
                    ON CONFLICT (public_key) DO NOTHING
                """, mrf, mrf, file["name_es"] or file["alias"] or mrf[:20])

                migrated += 1

                if migrated % 100 == 0:
                    print(f"  Migrated: {migrated}")

            except Exception as e:
                errors += 1
                print(f"Error migrating {file['mrf']}: {e}")

        print("\nMigration finished:")
        print(f"  - Migrated: {migrated}")
        print(f"  - Skipped (already present or missing data): {skipped}")
        print(f"  - Errors: {errors}")

        # Refresh ref_count
        await conn.execute("""
            UPDATE storage.physical_blobs pb
            SET ref_count = (
                SELECT COUNT(*) FROM storage.user_assets ua
                WHERE ua.blob_hash = pb.content_hash
            )
        """)
        print("  - ref_count updated")

    await pool.close()


if __name__ == "__main__":
    asyncio.run(migrate())
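Since the migration reuses the atc mrf value as both content_hash and public_key, a migrated file can be looked up directly through the Storage API added later in this commit. A minimal lookup sketch, assuming the API from storage_api.py is running on localhost:8080; the mrf value below is a placeholder:

import json
import urllib.request

mrf = "REPLACE-WITH-AN-MRF-HASH"  # placeholder value taken from secretaria_clara.atc
info = json.loads(urllib.request.urlopen(
    f"http://localhost:8080/file/{mrf}/info").read())
print(info["filename"], info["size"], info["status"])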
apps/storage/requirements.txt (new file, 9 lines)
@@ -0,0 +1,9 @@
fastapi>=0.104.0
uvicorn>=0.24.0
asyncpg>=0.29.0
boto3>=1.34.0
Pillow>=10.0.0
PyMuPDF>=1.23.0
argon2-cffi>=23.1.0
python-multipart>=0.0.6
pydantic>=2.5.0
apps/storage/storage-api.service (new file, 20 lines)
@@ -0,0 +1,20 @@
[Unit]
Description=Storage API Server
After=network.target postgresql.service

[Service]
Type=simple
User=root
WorkingDirectory=/opt/storage
ExecStart=/opt/storage/venv/bin/python storage_api.py
Restart=always
RestartSec=5

Environment=R2_ENDPOINT=https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com
Environment=R2_BUCKET=deck
Environment=DATABASE_URL=postgresql://tzzr:tzzr@localhost:5432/tzzr
Environment=AWS_ACCESS_KEY_ID=
Environment=AWS_SECRET_ACCESS_KEY=

[Install]
WantedBy=multi-user.target
apps/storage/storage_api.py (new file, 445 lines)
@@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""
Storage API - file upload/download endpoints
Spec: Hybrid Storage System v4.0
"""

import os
import hashlib
import json
import asyncio
from datetime import datetime, timedelta
from typing import Optional
import asyncpg
import boto3
from fastapi import FastAPI, HTTPException, Request, Header, Query, BackgroundTasks
from fastapi.responses import RedirectResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn
from collections import defaultdict
import time
import argon2

# Configuration
R2_ENDPOINT = os.environ.get("R2_ENDPOINT", "https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com")
R2_BUCKET = os.environ.get("R2_BUCKET", "deck")
DB_URL = os.environ.get("DATABASE_URL", "postgresql://tzzr:tzzr@localhost:5432/tzzr")

PRESIGNED_UPLOAD_EXPIRY = 3 * 60 * 60  # 3 hours
PRESIGNED_DOWNLOAD_EXPIRY = 45 * 60    # 45 minutes

# Rate limiting
RATE_LIMIT_IP = 100  # requests/min per IP
RATE_LIMIT_KEY = 50  # downloads/hour per public_key
RATE_LIMIT_TRANSFER = 10 * 1024 * 1024 * 1024  # 10 GB/hour

app = FastAPI(title="Storage API", version="4.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global state
db_pool = None
s3_client = None
rate_limits = {
    "ip": defaultdict(list),       # IP -> [timestamps]
    "key": defaultdict(list),      # public_key -> [timestamps]
    "transfer": defaultdict(int)   # IP -> bytes
}

ph = argon2.PasswordHasher()


# =========================================================================
# STARTUP / SHUTDOWN
# =========================================================================

@app.on_event("startup")
async def startup():
    global db_pool, s3_client
    db_pool = await asyncpg.create_pool(DB_URL, min_size=2, max_size=20)
    s3_client = boto3.client(
        "s3",
        endpoint_url=R2_ENDPOINT,
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    )


@app.on_event("shutdown")
async def shutdown():
    if db_pool:
        await db_pool.close()


# =========================================================================
# RATE LIMITING
# =========================================================================

def check_rate_limit_ip(ip: str) -> bool:
    """100 requests/min per IP"""
    now = time.time()
    minute_ago = now - 60

    # Drop stale timestamps
    rate_limits["ip"][ip] = [t for t in rate_limits["ip"][ip] if t > minute_ago]

    if len(rate_limits["ip"][ip]) >= RATE_LIMIT_IP:
        return False

    rate_limits["ip"][ip].append(now)
    return True


def check_rate_limit_key(public_key: str) -> bool:
    """50 downloads/hour per public_key"""
    now = time.time()
    hour_ago = now - 3600

    rate_limits["key"][public_key] = [t for t in rate_limits["key"][public_key] if t > hour_ago]

    if len(rate_limits["key"][public_key]) >= RATE_LIMIT_KEY:
        return False

    rate_limits["key"][public_key].append(now)
    return True


# =========================================================================
# MODELS
# =========================================================================

class UploadInitRequest(BaseModel):
    hash: str
    size: int
    mime_type: str
    filename: str
    user_id: str
    password: Optional[str] = None


class UploadInitResponse(BaseModel):
    status: str
    presigned_url: Optional[str] = None
    deduplicated: bool = False
    public_key: Optional[str] = None


# =========================================================================
# UPLOAD ENDPOINTS
# =========================================================================

@app.post("/upload/init", response_model=UploadInitResponse)
async def upload_init(req: UploadInitRequest, request: Request, background_tasks: BackgroundTasks):
    """
    Start an upload. Returns a presigned URL or confirms deduplication.
    """
    client_ip = request.client.host

    if not check_rate_limit_ip(client_ip):
        raise HTTPException(429, "Rate limit exceeded")

    async with db_pool.acquire() as conn:
        # Check whether the blob already exists
        blob = await conn.fetchrow("""
            SELECT content_hash, verification_status
            FROM storage.physical_blobs
            WHERE content_hash = $1
        """, req.hash)

        if blob:
            if blob["verification_status"] == "VERIFIED":
                # Deduplication: create the asset without uploading
                public_key = hashlib.sha256(
                    f"{req.hash}{req.user_id}{datetime.now().isoformat()}".encode()
                ).hexdigest()

                password_hash = None
                if req.password:
                    password_hash = ph.hash(req.password)

                await conn.execute("""
                    INSERT INTO storage.user_assets
                    (public_key, blob_hash, user_id, original_filename, access_password)
                    VALUES ($1, $2, $3, $4, $5)
                """, public_key, req.hash, req.user_id, req.filename, password_hash)

                return UploadInitResponse(
                    status="created",
                    deduplicated=True,
                    public_key=public_key
                )

            # Blob exists but is still PENDING - the client has to upload anyway

        else:
            # Create a PENDING record
            storage_path = f"{req.hash}.bin"
            await conn.execute("""
                INSERT INTO storage.physical_blobs
                (content_hash, file_size, mime_type, storage_provider, storage_path)
                VALUES ($1, $2, $3, 'R2_PRIMARY', $4)
            """, req.hash, req.size, req.mime_type, storage_path)

        # Generate a presigned URL for the upload
        storage_path = f"{req.hash}.bin"
        presigned_url = s3_client.generate_presigned_url(
            "put_object",
            Params={
                "Bucket": R2_BUCKET,
                "Key": storage_path,
                "ContentType": req.mime_type
            },
            ExpiresIn=PRESIGNED_UPLOAD_EXPIRY
        )

        return UploadInitResponse(
            status="upload_required",
            presigned_url=presigned_url,
            deduplicated=False
        )


@app.post("/upload/complete/{content_hash}")
async def upload_complete(
    content_hash: str,
    user_id: str = Query(...),
    filename: str = Query(...),
    password: Optional[str] = Query(None),
    background_tasks: BackgroundTasks = None
):
    """
    Confirm that the upload finished. Queues verification.
    """
    async with db_pool.acquire() as conn:
        blob = await conn.fetchrow("""
            SELECT content_hash, storage_path
            FROM storage.physical_blobs
            WHERE content_hash = $1
        """, content_hash)

        if not blob:
            raise HTTPException(404, "Blob not found")

        # Queue verification in the background
        # In production this would go through a queue (Redis, RabbitMQ, etc.)
        background_tasks.add_task(
            verify_and_finalize,
            content_hash,
            blob["storage_path"],
            user_id,
            filename,
            password
        )

        return {"status": "processing", "content_hash": content_hash}


async def verify_and_finalize(
    content_hash: str,
    storage_path: str,
    user_id: str,
    filename: str,
    password: Optional[str]
):
    """Background task that verifies and finalizes an upload"""
    from storage_worker import StorageWorker

    worker = StorageWorker()
    await worker.init()

    try:
        result = await worker.process_upload(
            content_hash,
            storage_path,
            user_id,
            filename,
            ph.hash(password) if password else None
        )
        # In production: notify the client via webhook/websocket
        print(f"Upload finalized: {result}")
    finally:
        await worker.close()


# =========================================================================
# DOWNLOAD ENDPOINTS
# =========================================================================

@app.get("/file/{public_key}")
async def download_file(
    public_key: str,
    request: Request,
    password: Optional[str] = Query(None)
):
    """
    File download. Returns a redirect to a signed URL.
    """
    client_ip = request.client.host

    # Rate limiting
    if not check_rate_limit_ip(client_ip):
        raise HTTPException(429, "Rate limit exceeded - IP")

    if not check_rate_limit_key(public_key):
        raise HTTPException(429, "Rate limit exceeded - downloads")

    async with db_pool.acquire() as conn:
        # Look up the asset
        asset = await conn.fetchrow("""
            SELECT a.id, a.blob_hash, a.original_filename, a.access_password, a.downloads_count,
                   b.storage_provider, b.storage_path, b.verification_status, b.mime_type
            FROM storage.user_assets a
            JOIN storage.physical_blobs b ON a.blob_hash = b.content_hash
            WHERE a.public_key = $1
        """, public_key)

        if not asset:
            raise HTTPException(404, "Asset not found")

        # Check the password if one is required
        if asset["access_password"]:
            if not password:
                raise HTTPException(401, "Password required")
            try:
                ph.verify(asset["access_password"], password)
            except:
                raise HTTPException(401, "Invalid password")

        # Check the blob's state
        status = asset["verification_status"]

        if status == "PENDING":
            raise HTTPException(202, "File is being processed")

        if status in ("CORRUPT", "LOST"):
            raise HTTPException(410, "File is no longer available")

        # Increment the download counter
        await conn.execute("""
            UPDATE storage.user_assets
            SET downloads_count = downloads_count + 1
            WHERE id = $1
        """, asset["id"])

        # Generate a signed URL depending on the provider
        provider = asset["storage_provider"]

        if provider in ("R2_PRIMARY", "R2_CACHE"):
            presigned_url = s3_client.generate_presigned_url(
                "get_object",
                Params={
                    "Bucket": R2_BUCKET,
                    "Key": asset["storage_path"],
                    "ResponseContentDisposition": f'attachment; filename="{asset["original_filename"]}"',
                    "ResponseContentType": asset["mime_type"]
                },
                ExpiresIn=PRESIGNED_DOWNLOAD_EXPIRY
            )
            return RedirectResponse(presigned_url, status_code=302)

        elif provider == "SHAREPOINT":
            # TODO: implement SharePoint access via the Graph API
            raise HTTPException(503, "SharePoint access not implemented")

        else:
            raise HTTPException(503, "Unknown storage provider")


@app.get("/file/{public_key}/info")
async def file_info(public_key: str, request: Request):
    """
    File information without downloading it.
    """
    client_ip = request.client.host

    if not check_rate_limit_ip(client_ip):
        raise HTTPException(429, "Rate limit exceeded")

    async with db_pool.acquire() as conn:
        asset = await conn.fetchrow("""
            SELECT a.public_key, a.original_filename, a.downloads_count, a.created_at,
                   b.file_size, b.mime_type, b.verification_status,
                   (a.access_password IS NOT NULL) as password_protected
            FROM storage.user_assets a
            JOIN storage.physical_blobs b ON a.blob_hash = b.content_hash
            WHERE a.public_key = $1
        """, public_key)

        if not asset:
            raise HTTPException(404, "Asset not found")

        return {
            "public_key": asset["public_key"],
            "filename": asset["original_filename"],
            "size": asset["file_size"],
            "mime_type": asset["mime_type"],
            "status": asset["verification_status"],
            "downloads": asset["downloads_count"],
            "password_protected": asset["password_protected"],
            "created_at": asset["created_at"].isoformat()
        }


@app.get("/file/{public_key}/thumb")
async def file_thumbnail(public_key: str, request: Request):
    """
    Redirect to the file's thumbnail.
    """
    client_ip = request.client.host

    if not check_rate_limit_ip(client_ip):
        raise HTTPException(429, "Rate limit exceeded")

    async with db_pool.acquire() as conn:
        asset = await conn.fetchrow("""
            SELECT a.blob_hash, b.verification_status
            FROM storage.user_assets a
            JOIN storage.physical_blobs b ON a.blob_hash = b.content_hash
            WHERE a.public_key = $1
        """, public_key)

        if not asset:
            raise HTTPException(404, "Asset not found")

        if asset["verification_status"] != "VERIFIED":
            raise HTTPException(202, "Thumbnail not ready")

        # Thumbnail URL
        thumb_key = f"{asset['blob_hash']}.thumb"

        try:
            # Check that it exists
            s3_client.head_object(Bucket=R2_BUCKET, Key=thumb_key)
        except:
            raise HTTPException(404, "Thumbnail not available")

        presigned_url = s3_client.generate_presigned_url(
            "get_object",
            Params={"Bucket": R2_BUCKET, "Key": thumb_key},
            ExpiresIn=PRESIGNED_DOWNLOAD_EXPIRY
        )

        return RedirectResponse(presigned_url, status_code=302)


# =========================================================================
# HEALTH
# =========================================================================

@app.get("/health")
async def health():
    return {"status": "ok", "timestamp": datetime.now().isoformat()}


# =========================================================================
# MAIN
# =========================================================================

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)
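For reference, a minimal client-side sketch of the upload protocol above (init, direct PUT to the presigned URL, then complete). It assumes the service runs on http://localhost:8080 as configured in storage-api.service; the file name, MIME type, and user_id are illustrative placeholders, and only the standard library is used:

import hashlib
import json
import urllib.request

API = "http://localhost:8080"                      # assumed local deployment
USER_ID = "00000000-0000-0000-0000-000000000000"   # placeholder user
data = open("example.pdf", "rb").read()
content_hash = hashlib.sha256(data).hexdigest()

# 1. /upload/init: either deduplicated on the spot or handed a presigned PUT URL
init_req = urllib.request.Request(
    f"{API}/upload/init",
    data=json.dumps({
        "hash": content_hash, "size": len(data), "mime_type": "application/pdf",
        "filename": "example.pdf", "user_id": USER_ID,
    }).encode(),
    headers={"Content-Type": "application/json"},
)
init = json.loads(urllib.request.urlopen(init_req).read())

if init["status"] == "upload_required":
    # 2. PUT the bytes straight to R2 through the presigned URL
    urllib.request.urlopen(urllib.request.Request(
        init["presigned_url"], data=data, method="PUT",
        headers={"Content-Type": "application/pdf"}))
    # 3. /upload/complete: triggers hash verification and derivative generation
    urllib.request.urlopen(urllib.request.Request(
        f"{API}/upload/complete/{content_hash}"
        f"?user_id={USER_ID}&filename=example.pdf", method="POST"))
else:
    print("Deduplicated, public_key:", init["public_key"])

Downloads are the mirror image: GET /file/{public_key} answers with a 302 redirect to a presigned R2 URL, which most HTTP clients follow automatically.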
apps/storage/storage_worker.py (new file, 480 lines)
@@ -0,0 +1,480 @@
#!/usr/bin/env python3
"""
Storage Worker - file verification and processing
Spec: Hybrid Storage System v4.0
"""

import os
import hashlib
import json
import asyncio
import asyncpg
from datetime import datetime
from typing import Optional, Dict, Any
import boto3
from PIL import Image
import fitz  # PyMuPDF
import io
import tempfile

# Configuration
R2_ENDPOINT = os.environ.get("R2_ENDPOINT", "https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com")
R2_BUCKET = os.environ.get("R2_BUCKET", "deck")
DB_URL = os.environ.get("DATABASE_URL", "postgresql://tzzr:tzzr@localhost:5432/tzzr")

THUMB_WIDTH = 300
MAX_RETRIES = 9
RETRY_BACKOFF_BASE = 2


def get_s3_client():
    return boto3.client(
        "s3",
        endpoint_url=R2_ENDPOINT,
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    )


async def get_db_pool():
    return await asyncpg.create_pool(DB_URL, min_size=2, max_size=10)


def calculate_sha256(data: bytes) -> str:
    """Compute the SHA-256 of a byte string"""
    return hashlib.sha256(data).hexdigest()


def generate_public_key(content_hash: str, user_id: str) -> str:
    """Generate a unique public_key for an asset"""
    data = f"{content_hash}{user_id}{datetime.now().isoformat()}"
    return hashlib.sha256(data.encode()).hexdigest()


class StorageWorker:
    def __init__(self):
        self.s3 = get_s3_client()
        self.pool = None

    async def init(self):
        self.pool = await get_db_pool()

    async def close(self):
        if self.pool:
            await self.pool.close()

    # =========================================================================
    # HASH VERIFICATION
    # =========================================================================

    async def verify_blob(self, declared_hash: str, storage_path: str) -> Dict[str, Any]:
        """
        Verify that the declared hash matches the actual content.
        NEVER trust the client's hash.
        """
        try:
            # Download the file
            obj = self.s3.get_object(Bucket=R2_BUCKET, Key=storage_path)
            content = obj["Body"].read()

            # Compute the real hash
            calculated_hash = calculate_sha256(content)

            if calculated_hash != declared_hash:
                # HASH MISMATCH - corrupt file or spoofing
                await self._mark_corrupt(declared_hash, storage_path)
                return {
                    "status": "CORRUPT",
                    "declared": declared_hash,
                    "calculated": calculated_hash,
                    "action": "deleted"
                }

            # Hash matches - mark as verified
            await self._mark_verified(declared_hash)

            return {
                "status": "VERIFIED",
                "hash": declared_hash,
                "size": len(content)
            }

        except Exception as e:
            return {"status": "ERROR", "error": str(e)}

    async def _mark_corrupt(self, content_hash: str, storage_path: str):
        """Mark the blob as corrupt and delete the file"""
        async with self.pool.acquire() as conn:
            await conn.execute("""
                UPDATE storage.physical_blobs
                SET verification_status = 'CORRUPT', updated_at = NOW()
                WHERE content_hash = $1
            """, content_hash)

        # Delete the file from the bucket
        try:
            self.s3.delete_object(Bucket=R2_BUCKET, Key=storage_path)
        except:
            pass

    async def _mark_verified(self, content_hash: str):
        """Mark the blob as verified"""
        async with self.pool.acquire() as conn:
            await conn.execute("""
                UPDATE storage.physical_blobs
                SET verification_status = 'VERIFIED',
                    last_verified_at = NOW(),
                    updated_at = NOW()
                WHERE content_hash = $1
            """, content_hash)

    # =========================================================================
    # DERIVATIVE GENERATION
    # =========================================================================

    async def generate_derivatives(self, content_hash: str) -> Dict[str, Any]:
        """Generate a thumbnail and metadata for a verified blob"""
        async with self.pool.acquire() as conn:
            blob = await conn.fetchrow("""
                SELECT content_hash, mime_type, storage_path, file_size
                FROM storage.physical_blobs
                WHERE content_hash = $1 AND verification_status = 'VERIFIED'
            """, content_hash)

        if not blob:
            return {"status": "ERROR", "error": "Blob not found or not verified"}

        mime_type = blob["mime_type"]
        storage_path = blob["storage_path"]

        # Download the file
        obj = self.s3.get_object(Bucket=R2_BUCKET, Key=storage_path)
        content = obj["Body"].read()

        metadata = {
            "content_hash": content_hash,
            "mime_type": mime_type,
            "file_size": blob["file_size"],
            "processed_at": datetime.now().isoformat()
        }

        thumb_generated = False

        # Generate a thumbnail depending on the type
        if mime_type.startswith("image/"):
            thumb_data, extra_meta = self._process_image(content)
            metadata.update(extra_meta)
            if thumb_data:
                await self._save_thumb(content_hash, thumb_data)
                thumb_generated = True

        elif mime_type == "application/pdf":
            thumb_data, extra_meta = self._process_pdf(content)
            metadata.update(extra_meta)
            if thumb_data:
                await self._save_thumb(content_hash, thumb_data)
                thumb_generated = True

        # Save metadata
        await self._save_metadata(content_hash, metadata)

        return {
            "status": "OK",
            "thumb_generated": thumb_generated,
            "metadata": metadata
        }

    def _process_image(self, content: bytes) -> tuple:
        """Process an image: generate a thumbnail and extract metadata"""
        try:
            img = Image.open(io.BytesIO(content))

            # Metadata
            meta = {
                "width": img.width,
                "height": img.height,
                "format": img.format,
                "mode": img.mode
            }

            # EXIF, if available
            if hasattr(img, '_getexif') and img._getexif():
                meta["has_exif"] = True

            # Generate the thumbnail
            ratio = THUMB_WIDTH / img.width
            new_height = int(img.height * ratio)
            thumb = img.copy()
            thumb.thumbnail((THUMB_WIDTH, new_height), Image.Resampling.LANCZOS)

            # Convert to bytes (force RGB so JPEG encoding also works for RGBA/P images)
            if thumb.mode not in ("RGB", "L"):
                thumb = thumb.convert("RGB")
            thumb_buffer = io.BytesIO()
            thumb.save(thumb_buffer, format="JPEG", quality=85)
            thumb_data = thumb_buffer.getvalue()

            return thumb_data, meta

        except Exception as e:
            return None, {"error": str(e)}

    def _process_pdf(self, content: bytes) -> tuple:
        """Process a PDF: render a first-page thumbnail and extract metadata"""
        try:
            doc = fitz.open(stream=content, filetype="pdf")

            meta = {
                "pages": len(doc),
                "format": "PDF"
            }

            # Document metadata
            pdf_meta = doc.metadata
            if pdf_meta:
                if pdf_meta.get("author"):
                    meta["author"] = pdf_meta["author"]
                if pdf_meta.get("title"):
                    meta["title"] = pdf_meta["title"]

            # Render the first page as the thumbnail
            if len(doc) > 0:
                page = doc[0]
                # Scale so that the width equals THUMB_WIDTH
                zoom = THUMB_WIDTH / page.rect.width
                mat = fitz.Matrix(zoom, zoom)
                pix = page.get_pixmap(matrix=mat)
                thumb_data = pix.tobytes("jpeg")
            else:
                thumb_data = None

            doc.close()
            return thumb_data, meta

        except Exception as e:
            return None, {"error": str(e)}

    async def _save_thumb(self, content_hash: str, thumb_data: bytes):
        """Save the thumbnail to the bucket"""
        key = f"{content_hash}.thumb"
        self.s3.put_object(
            Bucket=R2_BUCKET,
            Key=key,
            Body=thumb_data,
            ContentType="image/jpeg"
        )

    async def _save_metadata(self, content_hash: str, metadata: dict):
        """Save the JSON metadata to the bucket"""
        key = f"{content_hash}.json"
        self.s3.put_object(
            Bucket=R2_BUCKET,
            Key=key,
            Body=json.dumps(metadata, indent=2),
            ContentType="application/json"
        )

    # =========================================================================
    # FULL PROCESSING
    # =========================================================================

    async def process_upload(
        self,
        declared_hash: str,
        storage_path: str,
        user_id: str,
        original_filename: str,
        access_password: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Full post-upload pipeline:
        1. Verify the hash
        2. Generate derivatives
        3. Create the user_asset
        """
        # 1. Verify the hash
        verify_result = await self.verify_blob(declared_hash, storage_path)

        if verify_result["status"] != "VERIFIED":
            return verify_result

        # 2. Generate derivatives (with retries)
        for attempt in range(MAX_RETRIES):
            try:
                deriv_result = await self.generate_derivatives(declared_hash)
                if deriv_result["status"] == "OK":
                    break
            except Exception as e:
                if attempt == MAX_RETRIES - 1:
                    # Last attempt failed, but the blob is already verified
                    deriv_result = {"status": "PARTIAL", "error": str(e)}
                else:
                    await asyncio.sleep(RETRY_BACKOFF_BASE ** attempt)

        # 3. Create the user_asset
        public_key = generate_public_key(declared_hash, user_id)

        async with self.pool.acquire() as conn:
            await conn.execute("""
                INSERT INTO storage.user_assets
                (public_key, blob_hash, user_id, original_filename, access_password)
                VALUES ($1, $2, $3, $4, $5)
            """, public_key, declared_hash, user_id, original_filename, access_password)

        return {
            "status": "CREATED",
            "public_key": public_key,
            "content_hash": declared_hash,
            "derivatives": deriv_result
        }

    # =========================================================================
    # BLOB REGISTRATION (no upload - for existing files)
    # =========================================================================

    async def register_blob(
        self,
        content_hash: str,
        file_size: int,
        mime_type: str,
        storage_provider: str,
        storage_path: str
    ) -> Dict[str, Any]:
        """Register an existing blob in the system"""
        async with self.pool.acquire() as conn:
            # Check whether it already exists
            existing = await conn.fetchrow("""
                SELECT content_hash, verification_status
                FROM storage.physical_blobs
                WHERE content_hash = $1
            """, content_hash)

            if existing:
                return {
                    "status": "EXISTS",
                    "content_hash": content_hash,
                    "verification_status": existing["verification_status"]
                }

            # Insert the new blob
            await conn.execute("""
                INSERT INTO storage.physical_blobs
                (content_hash, file_size, mime_type, storage_provider, storage_path)
                VALUES ($1, $2, $3, $4::storage.storage_provider_enum, $5)
            """, content_hash, file_size, mime_type, storage_provider, storage_path)

            return {
                "status": "REGISTERED",
                "content_hash": content_hash,
                "verification_status": "PENDING"
            }

    # =========================================================================
    # MAINTENANCE
    # =========================================================================

    async def garbage_collect(self, dry_run: bool = True) -> Dict[str, Any]:
        """
        Delete orphaned blobs (ref_count = 0, not updated in 30 days)
        """
        async with self.pool.acquire() as conn:
            orphans = await conn.fetch("""
                SELECT content_hash, storage_path
                FROM storage.physical_blobs
                WHERE ref_count = 0
                  AND updated_at < NOW() - INTERVAL '30 days'
            """)

        deleted = []
        for blob in orphans:
            if not dry_run:
                # Delete derivatives
                for ext in [".thumb", ".json"]:
                    try:
                        self.s3.delete_object(Bucket=R2_BUCKET, Key=f"{blob['content_hash']}{ext}")
                    except:
                        pass

                # Delete the blob
                try:
                    self.s3.delete_object(Bucket=R2_BUCKET, Key=blob["storage_path"])
                except:
                    pass

                # Delete the record
                async with self.pool.acquire() as conn:
                    await conn.execute("""
                        DELETE FROM storage.physical_blobs WHERE content_hash = $1
                    """, blob["content_hash"])

                deleted.append(blob["content_hash"])

        return {
            "status": "OK",
            "dry_run": dry_run,
            "orphans_found": len(orphans),
            "deleted": deleted if not dry_run else []
        }

    async def integrity_check(self, sample_percent: float = 0.01) -> Dict[str, Any]:
        """
        Verify the integrity of a random sample of blobs
        """
        async with self.pool.acquire() as conn:
            blobs = await conn.fetch("""
                SELECT content_hash, storage_path
                FROM storage.physical_blobs
                WHERE verification_status = 'VERIFIED'
                ORDER BY RANDOM()
                LIMIT (SELECT CEIL(COUNT(*) * $1) FROM storage.physical_blobs WHERE verification_status = 'VERIFIED')
            """, sample_percent)

        results = {"checked": 0, "ok": 0, "corrupt": []}

        for blob in blobs:
            results["checked"] += 1
            verify = await self.verify_blob(blob["content_hash"], blob["storage_path"])

            if verify["status"] == "VERIFIED":
                results["ok"] += 1
            else:
                results["corrupt"].append(blob["content_hash"])

        return results


# CLI for testing
async def main():
    import sys

    worker = StorageWorker()
    await worker.init()

    if len(sys.argv) < 2:
        print("Usage: storage_worker.py <command> [args]")
        print("Commands: gc, integrity, register")
        return

    cmd = sys.argv[1]

    if cmd == "gc":
        dry_run = "--execute" not in sys.argv
        result = await worker.garbage_collect(dry_run=dry_run)
        print(json.dumps(result, indent=2))

    elif cmd == "integrity":
        result = await worker.integrity_check()
        print(json.dumps(result, indent=2))

    elif cmd == "register":
        if len(sys.argv) < 6:
            print("Usage: storage_worker.py register <hash> <size> <mime> <path>")
            return
        result = await worker.register_blob(
            sys.argv[2], int(sys.argv[3]), sys.argv[4], "R2_PRIMARY", sys.argv[5]
        )
        print(json.dumps(result, indent=2))

    await worker.close()


if __name__ == "__main__":
    asyncio.run(main())
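A minimal sketch of driving StorageWorker programmatically instead of through the CLI above (same environment variables as the service; the hash and path are placeholders, and garbage collection is left in dry-run mode):

import asyncio
from storage_worker import StorageWorker

async def demo():
    worker = StorageWorker()
    await worker.init()
    try:
        # Register a blob that already sits in the bucket (stays PENDING until verified)
        print(await worker.register_blob(
            "REPLACE-WITH-A-SHA256", 12345, "application/pdf",
            "R2_PRIMARY", "REPLACE-WITH-A-SHA256.bin"))
        # Verify it against the actual bucket contents
        print(await worker.verify_blob("REPLACE-WITH-A-SHA256", "REPLACE-WITH-A-SHA256.bin"))
        # Report orphans without deleting anything
        print(await worker.garbage_collect(dry_run=True))
    finally:
        await worker.close()

asyncio.run(demo())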
apps/storage/sync_metadata.py (new file, 130 lines)
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Sync metadata from the JSON files in the R2 bucket into storage.physical_blobs
"""

import os
import json
import boto3
import asyncio
import asyncpg

R2_ENDPOINT = "https://7dedae6030f5554d99d37e98a5232996.r2.cloudflarestorage.com"
R2_BUCKET = "deck"


def get_s3_client():
    return boto3.client(
        "s3",
        endpoint_url=R2_ENDPOINT,
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    )


async def sync():
    s3 = get_s3_client()
    pool = await asyncpg.create_pool(
        "postgresql:///tzzr?host=/var/run/postgresql",
        min_size=2, max_size=10
    )

    async with pool.acquire() as conn:
        blobs = await conn.fetch("""
            SELECT content_hash, storage_path
            FROM storage.physical_blobs
            WHERE file_size = 0
        """)

        print(f"Syncing metadata for {len(blobs)} blobs...")

        updated = 0
        errors = 0

        for blob in blobs:
            hash = blob["content_hash"]
            json_key = f"{hash}.json"

            try:
                obj = s3.get_object(Bucket=R2_BUCKET, Key=json_key)
                meta = json.loads(obj["Body"].read())

                # Extract data
                l2 = meta.get("jsonb_standard", {}).get("L2_document", {})
                size_bytes = l2.get("size_bytes", 0)
                mime_type = l2.get("mime_type")
                ext = meta.get("ext", "pdf")
                url_atc = meta.get("url_atc", [])
                storage_path = url_atc[0] if url_atc else f"{hash}.{ext}"

                if not mime_type:
                    if ext == "pdf":
                        mime_type = "application/pdf"
                    elif ext in ("jpg", "jpeg"):
                        mime_type = "image/jpeg"
                    elif ext == "png":
                        mime_type = "image/png"
                    else:
                        mime_type = "application/octet-stream"

                # Get the real file size if it is not in the JSON
                if size_bytes == 0:
                    try:
                        file_obj = s3.head_object(Bucket=R2_BUCKET, Key=storage_path)
                        size_bytes = file_obj.get("ContentLength", 0)
                    except:
                        pass

                # Update the record
                await conn.execute("""
                    UPDATE storage.physical_blobs
                    SET file_size = $2,
                        mime_type = $3,
                        storage_path = $4
                    WHERE content_hash = $1
                """, hash, size_bytes, mime_type, storage_path)

                updated += 1

                if updated % 100 == 0:
                    print(f"  Updated: {updated}")

            except s3.exceptions.NoSuchKey:
                # The JSON does not exist; try to get the file size directly
                try:
                    # Try different extensions
                    for ext in ["pdf", "png", "jpg"]:
                        try:
                            file_key = f"{hash}.{ext}"
                            file_obj = s3.head_object(Bucket=R2_BUCKET, Key=file_key)
                            size_bytes = file_obj.get("ContentLength", 0)
                            content_type = file_obj.get("ContentType", "application/octet-stream")

                            await conn.execute("""
                                UPDATE storage.physical_blobs
                                SET file_size = $2,
                                    mime_type = $3,
                                    storage_path = $4
                                WHERE content_hash = $1
                            """, hash, size_bytes, content_type, file_key)

                            updated += 1
                            break
                        except:
                            continue
                except Exception as e:
                    errors += 1

            except Exception as e:
                errors += 1
                print(f"Error on {hash}: {e}")

        print("\nSync finished:")
        print(f"  - Updated: {updated}")
        print(f"  - Errors: {errors}")

    await pool.close()


if __name__ == "__main__":
    asyncio.run(sync())