fix(traefik): increase read-timeout to avoid crashing ocis for large uploads

Traefik's default readTimeout of 60s was killing the upload connection. The cascade was:
1. Large upload exceeds 60s → Traefik kills the connection
2. storageusers floods with NetworkTimeoutError
3. Aborted uploads generate tons of NATS events
4. NATS gets overwhelmed → no response from the stream
5. Proxy can't resolve user roles → login returns 500
2026-04-12 18:49:02 +02:00 · 2026-04-12 18:49:02 +02:00 · 88fa8c4df3
parent f57d29d1d3
commit 88fa8c4df3
2 changed files with 8 additions and 0 deletions
--- a/prototypes/traefik/helm/traefik.yaml
+++ b/prototypes/traefik/helm/traefik.yaml
@ -11,8 +11,14 @@ deployment:
 ports:
  web:
    hostPort: 80
+    transport:
+      respondingTimeouts:
+        readTimeout: 600s
  websecure:
    hostPort: 443
+    transport:
+      respondingTimeouts:
+        readTimeout: 600s
  metrics:
    expose:
      default: true
--- a/rendered/envs/production/traefik/daemonset-traefik.yaml
+++ b/rendered/envs/production/traefik/daemonset-traefik.yaml
@ -44,7 +44,9 @@ spec:
            - --providers.kubernetesingress
            - --providers.kubernetesingress.allowEmptyServices=true
            - --providers.kubernetesingress.ingressendpoint.publishedservice=traefik/traefik
+            - --entryPoints.web.transport.respondingTimeouts.readTimeout=600s
            - --entryPoints.websecure.http.tls=true
+            - --entryPoints.websecure.transport.respondingTimeouts.readTimeout=600s
            - --log.level=INFO
          env:
            - name: POD_NAME