From c12d24949c4acfcfe53a2984cf6d67968f609132 Mon Sep 17 00:00:00 2001
From: Marcin Rataj <lidel@lidel.org>
Date: Thu, 21 Aug 2025 14:44:19 +0200
Subject: [PATCH] feat: optimize docker builds (#10925)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(docker): optimize Dockerfile for faster builds

- add BuildKit syntax directive for advanced caching features
- implement cache mounts for Go modules and build cache
- reduce layers by combining RUN commands (5→2 in final stage)
- optimize apt-get with --no-install-recommends flag
- use COPY --chmod to avoid separate permission fixing

Performance improvements:
- incremental builds after code changes: ~8.6x faster (1m51s → 13s)
- go module/build cache persists between builds
- reduced layer count improves cache efficiency

* ci: optimize Docker builds with BuildKit caching

- enable BuildKit with GitHub Actions cache backend
- add Docker Hub registry cache for cross-workflow sharing
- move Docker login earlier to enable registry cache writes
- use dual cache strategy (gha + registry) for faster builds

expected improvements:
- PR builds can reuse main branch cache from Docker Hub
- rebuild after code changes ~5-10x faster with persistent cache
- cross-PR cache sharing reduces redundant builds
---
 .github/workflows/docker-build.yml | 21 ++++++--
 .github/workflows/docker-image.yml | 49 ++++++++----------
 Dockerfile                         | 80 +++++++++++++-----------------
 3 files changed, 74 insertions(+), 76 deletions(-)

diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index bd1a5cf41..24ece3fa4 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -27,8 +27,21 @@ jobs:
         shell: bash
     steps:
       - uses: actions/checkout@v5
-      - uses: actions/setup-go@v5
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build Docker image with BuildKit
+        uses: docker/build-push-action@v6
         with:
-          go-version: 1.25.x
-      - run: docker build -t $IMAGE_NAME:$WIP_IMAGE_TAG .
-      - run: docker run --rm $IMAGE_NAME:$WIP_IMAGE_TAG --version
+          context: .
+          push: false
+          load: true  # make the image available to the local daemon for the test step below
+          tags: ${{ env.IMAGE_NAME }}:${{ env.WIP_IMAGE_TAG }}
+          cache-from: |
+            type=gha
+            type=registry,ref=${{ env.IMAGE_NAME }}:buildcache
+          cache-to: type=gha,mode=max  # registry cache is only read here, never written
+
+      - name: Test Docker image
+        run: docker run --rm $IMAGE_NAME:$WIP_IMAGE_TAG --version
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 8aa0dd77b..6d89c2980 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -46,13 +46,11 @@ jobs:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
 
-      - name: Cache Docker layers
-        uses: actions/cache@v4
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3  # runs before the builds so the Docker Hub registry cache can be written
         with:
-          path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
-          restore-keys: |
-            ${{ runner.os }}-buildx-
+          username: ${{ vars.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
 
       - name: Get tags
         id: tags
@@ -63,12 +61,6 @@ jobs:
           echo "EOF" >> $GITHUB_OUTPUT
         shell: bash
 
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USERNAME }}
-          password: ${{ secrets.DOCKER_PASSWORD }}
-
       # We have to build each platform separately because when using multi-arch
       # builds, only one platform is being loaded into the cache. This would
       # prevent us from testing the other platforms.
@@ -81,8 +73,10 @@ jobs:
           load: true
           file: ./Dockerfile
           tags: ${{ env.IMAGE_NAME }}:linux-amd64
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache-new
+          cache-from: |
+            type=gha
+            type=registry,ref=${{ env.IMAGE_NAME }}:buildcache
+          cache-to: type=gha,mode=max
 
       - name: Build Docker image (linux/arm/v7)
         uses: docker/build-push-action@v6
@@ -93,8 +87,10 @@ jobs:
           load: true
           file: ./Dockerfile
           tags: ${{ env.IMAGE_NAME }}:linux-arm-v7
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache-new
+          cache-from: |
+            type=gha
+            type=registry,ref=${{ env.IMAGE_NAME }}:buildcache
+          cache-to: type=gha,mode=max
 
       - name: Build Docker image (linux/arm64/v8)
         uses: docker/build-push-action@v6
@@ -105,8 +101,10 @@ jobs:
           load: true
           file: ./Dockerfile
           tags: ${{ env.IMAGE_NAME }}:linux-arm64-v8
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache-new
+          cache-from: |
+            type=gha
+            type=registry,ref=${{ env.IMAGE_NAME }}:buildcache
+          cache-to: type=gha,mode=max
 
       # We test all the images on amd64 host here. This uses QEMU to emulate
       # the other platforms.
@@ -132,12 +130,9 @@ jobs:
           push: true
           file: ./Dockerfile
           tags: "${{ github.event.inputs.tags || steps.tags.outputs.value }}"
-          cache-from: type=local,src=/tmp/.buildx-cache-new
-          cache-to: type=local,dest=/tmp/.buildx-cache-new
-
-      # https://github.com/docker/build-push-action/issues/252
-      # https://github.com/moby/buildkit/issues/1896
-      - name: Move cache to limit growth
-        run: |
-          rm -rf /tmp/.buildx-cache
-          mv /tmp/.buildx-cache-new /tmp/.buildx-cache
+          cache-from: |
+            type=gha
+            type=registry,ref=${{ env.IMAGE_NAME }}:buildcache
+          cache-to: |
+            type=gha,mode=max
+            type=registry,ref=${{ env.IMAGE_NAME }}:buildcache,mode=max
diff --git a/Dockerfile b/Dockerfile
index de66c7867..0db5f33b4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,15 @@
+# syntax=docker/dockerfile:1
+# Selects the current Dockerfile v1 frontend so BuildKit cache mounts (RUN --mount) work
 FROM --platform=${BUILDPLATFORM:-linux/amd64} golang:1.25 AS builder
 
 ARG TARGETOS TARGETARCH
 
 ENV SRC_DIR=/kubo
 
-# Download packages first so they can be cached.
+# Cache go module downloads between builds for faster rebuilds
 COPY go.mod go.sum $SRC_DIR/
-RUN cd $SRC_DIR \
+RUN --mount=type=cache,target=/go/pkg/mod \
+  cd $SRC_DIR \
   && go mod download
 
 COPY . $SRC_DIR
@@ -18,92 +21,79 @@ ARG IPFS_PLUGINS
 # Allow for other targets to be built, e.g.: docker build --build-arg MAKE_TARGET="nofuse"
 ARG MAKE_TARGET=build
 
-# Build the thing.
-# Also: fix getting HEAD commit hash via git rev-parse.
-RUN cd $SRC_DIR \
+# Build ipfs binary with cached go modules and build cache.
+# mkdir .git/objects allows git rev-parse to read commit hash for version info
+RUN --mount=type=cache,target=/go/pkg/mod \
+  --mount=type=cache,target=/root/.cache/go-build \
+  cd $SRC_DIR \
   && mkdir -p .git/objects \
   && GOOS=$TARGETOS GOARCH=$TARGETARCH GOFLAGS=-buildvcs=false make ${MAKE_TARGET} IPFS_PLUGINS=$IPFS_PLUGINS
 
-# Using Debian Buster because the version of busybox we're using is based on it
-# and we want to make sure the libraries we're using are compatible. That's also
-# why we're running this for the target platform.
+# Extract required runtime tools from Debian.
+# We use Debian instead of Alpine because we need glibc compatibility
+# for the busybox base image we're using.
 FROM debian:bookworm-slim AS utilities
 RUN set -eux; \
 	apt-get update; \
-	apt-get install -y \
+	apt-get install -y --no-install-recommends \
 		tini \
     # Using gosu (~2MB) instead of su-exec (~20KB) because it's easier to
     # install on Debian. Useful links:
     # - https://github.com/ncopa/su-exec#why-reinvent-gosu
     # - https://github.com/tianon/gosu/issues/52#issuecomment-441946745
 		gosu \
-    # This installs fusermount which we later copy over to the target image.
+    # fuse provides fusermount, which the final image copies in for ipfs mount
     fuse \
     ca-certificates \
 	; \
-	rm -rf /var/lib/apt/lists/*
+	apt-get clean; \
+	rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
-# Now comes the actual target image, which aims to be as small as possible.
+# Final minimal image with shell for debugging (busybox provides sh)
 FROM busybox:stable-glibc
 
-# Get the ipfs binary, entrypoint script, and TLS CAs from the build container.
+# Copy ipfs binary, startup scripts, and runtime dependencies
 ENV SRC_DIR=/kubo
 COPY --from=utilities /usr/sbin/gosu /sbin/gosu
 COPY --from=utilities /usr/bin/tini /sbin/tini
 COPY --from=utilities /bin/fusermount /usr/local/bin/fusermount
 COPY --from=utilities /etc/ssl/certs /etc/ssl/certs
 COPY --from=builder $SRC_DIR/cmd/ipfs/ipfs /usr/local/bin/ipfs
-COPY --from=builder $SRC_DIR/bin/container_daemon /usr/local/bin/start_ipfs
+COPY --from=builder --chmod=755 $SRC_DIR/bin/container_daemon /usr/local/bin/start_ipfs
 COPY --from=builder $SRC_DIR/bin/container_init_run /usr/local/bin/container_init_run
 
-# Add suid bit on fusermount so it will run properly
+# Set SUID for fusermount to enable FUSE mounting by non-root user
 RUN chmod 4755 /usr/local/bin/fusermount
 
-# Fix permissions on start_ipfs (ignore the build machine's permissions)
-RUN chmod 0755 /usr/local/bin/start_ipfs
-
-# Swarm TCP; should be exposed to the public
-EXPOSE 4001
-# Swarm UDP; should be exposed to the public
-EXPOSE 4001/udp
-# Daemon API; must not be exposed publicly but to client services under you control
+# Swarm P2P port (TCP/UDP) - expose publicly for peer connections
+EXPOSE 4001 4001/udp
+# API port - keep private, only for trusted clients
 EXPOSE 5001
-# Web Gateway; can be exposed publicly with a proxy, e.g. as https://ipfs.example.org
+# Gateway port - can be exposed publicly via reverse proxy
 EXPOSE 8080
-# Swarm Websockets; must be exposed publicly when the node is listening using the websocket transport (/ipX/.../tcp/8081/ws).
+# Swarm WebSockets - expose publicly for browser-based peers
 EXPOSE 8081
 
-# Create the fs-repo directory and switch to a non-privileged user.
+# Create ipfs user (uid 1000) and required directories with proper ownership
 ENV IPFS_PATH=/data/ipfs
-RUN mkdir -p $IPFS_PATH \
+RUN mkdir -p $IPFS_PATH /ipfs /ipns /mfs /container-init.d \
   && adduser -D -h $IPFS_PATH -u 1000 -G users ipfs \
-  && chown ipfs:users $IPFS_PATH
+  && chown ipfs:users $IPFS_PATH /ipfs /ipns /mfs /container-init.d
 
-# Create mount points for `ipfs mount` command
-RUN mkdir /ipfs /ipns /mfs \
-  && chown ipfs:users /ipfs /ipns /mfs
-
-# Create the init scripts directory
-RUN mkdir /container-init.d \
-  && chown ipfs:users /container-init.d
-
-# Expose the fs-repo as a volume.
-# start_ipfs initializes an fs-repo if none is mounted.
-# Important this happens after the USER directive so permissions are correct.
+# Volume for IPFS repository data persistence
 VOLUME $IPFS_PATH
 
 # The default logging level
 ENV GOLOG_LOG_LEVEL=""
 
-# This just makes sure that:
-# 1. There's an fs-repo, and initializes one if there isn't.
-# 2. The API and Gateway are accessible from outside the container.
+# Entrypoint initializes IPFS repo if needed and configures networking.
+# tini ensures proper signal handling and zombie process cleanup
 ENTRYPOINT ["/sbin/tini", "--", "/usr/local/bin/start_ipfs"]
 
-# Healthcheck for the container
-# QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn is the CID of empty folder
+# Health check verifies IPFS daemon is responsive.
+# Uses empty directory CID (QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn) as test
 HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
   CMD ipfs --api=/ip4/127.0.0.1/tcp/5001 dag stat /ipfs/QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn || exit 1
 
-# Execute the daemon subcommand by default
+# Default: run IPFS daemon with auto-migration enabled
 CMD ["daemon", "--migrate=true", "--agent-version-suffix=docker"]