You've already forked Docker-Image-Extract
							
							
		
			
				
	
	
		
			288 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
			
		
		
	
	
			288 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
| #!/bin/sh
 | |
| #
 | |
| # This script pulls and extracts all files from an image in Docker Hub.
 | |
| #
 | |
| # Copyright (c) 2020-2023, Jeremy Lin
 | |
| #
 | |
| # Permission is hereby granted, free of charge, to any person obtaining a
 | |
| # copy of this software and associated documentation files (the "Software"),
 | |
| # to deal in the Software without restriction, including without limitation
 | |
| # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 | |
| # and/or sell copies of the Software, and to permit persons to whom the
 | |
| # Software is furnished to do so, subject to the following conditions:
 | |
| #
 | |
| # The above copyright notice and this permission notice shall be included in
 | |
| # all copies or substantial portions of the Software.
 | |
| #
 | |
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | |
| # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | |
| # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | |
| # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | |
| # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 | |
| # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 | |
| # DEALINGS IN THE SOFTWARE.
 | |
| 
 | |
# Platform that is pulled when the -p option is not given.
PLATFORM_DEFAULT='linux/amd64'
# Platform to pull; starts out as the default above.
PLATFORM=$PLATFORM_DEFAULT
# Directory the image is extracted into when the -o option is not given.
OUT_DIR='./output'
 | |
| 
 | |
# Print the short usage summary (synopsis plus option list) to stdout.
# The current values of the PLATFORM and OUT_DIR globals are shown as the
# defaults for -p and -o.
usage() {
    # One argument per output line; an empty argument yields a blank line.
    printf '%s\n' \
        "This script pulls and extracts all files from an image in Docker Hub." \
        "" \
        "$0 [OPTIONS...] IMAGE[:REF]" \
        "" \
        "IMAGE can be a community user image (like 'some-user/some-image') or a" \
        "Docker official image (like 'hello-world', which contains no '/')." \
        "" \
        "REF is either a tag name or a full SHA-256 image digest (with a 'sha256:' prefix)." \
        "The default ref is the 'latest' tag." \
        "" \
        "Options:" \
        "" \
        "  -p PLATFORM  Pull image for the specified platform (default: ${PLATFORM})" \
        "               For a given image on Docker Hub, the 'Tags' tab lists the" \
        "               platforms supported for that image." \
        "  -o OUT_DIR   Extract image to the specified output dir (default: ${OUT_DIR})" \
        "  -h           Show help with usage examples"
}
 | |
| 
 | |
# Print the short usage summary followed by worked command-line examples.
# Everything goes to stdout; $0 is substituted so the examples show the name
# the script was invoked with.
usage_detailed() {
    usage
    printf '%s\n' \
        "" \
        "Examples:" \
        "" \
        "# Pull and extract all files in the 'hello-world' image tagged 'latest'." \
        "\$ $0 hello-world:latest" \
        "" \
        "# Same as above; ref defaults to the 'latest' tag." \
        "\$ $0 hello-world" \
        "" \
        "# Pull the 'hello-world' image for the 'linux/arm64/v8' platform." \
        "\$ $0 -p linux/arm64/v8 hello-world" \
        "" \
        "# Pull an image by digest." \
        "\$ $0 hello-world:sha256:90659bf80b44ce6be8234e6ff90a1ac34acbeb826903b02cfa0da11c82cbc042"
}
 | |
| 
 | |
# Report a command-line error, show the short usage text, and exit with
# status 1. $1 is the message printed after the "ERROR: " prefix.
usage_error() {
    echo "ERROR: $1"
    echo
    usage
    exit 1
}

# Running the script with no arguments at all is treated as a help request.
if [ $# -eq 0 ]; then
    usage_detailed
    exit 0
fi

# The leading ':' in the optstring selects silent error reporting: getopts
# then sets opt to '?' for an unknown option and to ':' for a missing option
# argument, with the offending option letter in OPTARG.
while getopts ':ho:p:' opt; do
    case "${opt}" in
        h)
            usage_detailed
            exit 0
            ;;
        o)
            OUT_DIR="${OPTARG}"
            ;;
        p)
            PLATFORM="${OPTARG}"
            ;;
        :)
            usage_error "Argument required for option '-$OPTARG'."
            ;;
        \?)
            usage_error "Invalid option '-$OPTARG'."
            ;;
    esac
done
# Drop the options that were just parsed so that $1 is the image spec.
shift "$((OPTIND - 1))"

if [ $# -eq 0 ]; then
    usage_error "Image to pull must be specified."
fi
 | |
| 
 | |
# Pre-flight check on the output location: an existing directory only earns a
# warning, while any other kind of existing file is fatal. A path that does
# not exist yet is fine.
if [ -d "${OUT_DIR}" ]; then
    echo "WARNING: Output dir already exists. If it contains a previous extracted image,"
    echo "there may be errors when trying to overwrite files with read-only permissions."
    echo
elif [ -e "${OUT_DIR}" ]; then
    echo "ERROR: Output dir already exists, but is not a directory."
    exit 1
fi
 | |
| 
 | |
# Succeed if a `curl` command can be found; prints nothing.
# The exit status is whatever `command -v` reports.
have_curl() {
    command -v curl 1>/dev/null
    return "$?"
}
 | |
| 
 | |
# Succeed if a `wget` command can be found; prints nothing.
# The exit status is whatever `command -v` reports.
have_wget() {
    command -v wget 1>/dev/null
    return "$?"
}
 | |
| 
 | |
# At least one HTTP client is needed for every later step. wget is probed
# only when curl is missing.
if have_curl || have_wget; then
    :
else
    echo "This script requires either curl or wget."
    exit 1
fi
 | |
| 
 | |
# Split IMAGE[:REF] at the first ':'; everything after it is the ref, so a
# digest ref such as 'sha256:<hex>' keeps its own colon.
image_spec="$1"
case "${image_spec}" in
    *:*)
        image="${image_spec%%:*}"
        ref="${image_spec#*:}"
        ;;
    *)
        image="${image_spec}"
        echo "Defaulting ref to tag 'latest'..."
        ref=latest
        ;;
esac

case "${image}" in
    */*)
        # Already namespaced (e.g. 'some-user/some-image').
        ;;
    *)
        # Docker official images are in the 'library' namespace.
        image="library/${image}"
        ;;
esac
 | |
| 
 | |
# Break PLATFORM ("os/arch[/variant]") into the OS, ARCH, and VARIANT globals.
# VARIANT ends up empty when the specifier has only two components.
# The here-document feeds `read` in the current shell, so the three variables
# remain set afterwards.
OLD_IFS=$IFS
IFS='/' read -r OS ARCH VARIANT <<PLATFORM_SPEC
$PLATFORM
PLATFORM_SPEC
IFS=$OLD_IFS
 | |
| 
 | |
# Given a JSON input on stdin, print the string value associated with the
# specified key ($1). Every occurrence of the key is reported, one value per
# line; nothing is printed when the key is absent or its value is not a
# non-empty string (e.g. null, a number, or ""). This avoids an extra
# dependency on a tool like `jq`.
#
# Written for plain POSIX sh: no `local`, and the "one or more" repetition is
# spelled `[^"][^"]*` because `\+` is not defined for basic regular
# expressions and is only a GNU extension.
extract() {
    # Match "<key>":"<val>" (assumes key/val won't contain double quotes).
    # The colon may have whitespace on either side.
    grep -o "\"$1\"[[:space:]]*:[[:space:]]*\"[^\"][^\"]*\"" |
        # Keep just <val>: delete the trailing '"', then greedily delete
        # everything up to and including the last remaining '"'.
        sed -e 's/"$//' -e 's/.*"//'
}
 | |
| 
 | |
# Fetch a URL to stdout. Any number of header arguments may be specified:
#
#   fetch <url> [name1: value1] [name2: value2] ...
#
# Uses curl when it is available and wget otherwise. Returns the exit status
# of the download tool, or 2 (with a message on stderr) when no URL is given.
fetch() {
    if [ $# -lt 1 ]; then
        echo "fetch: missing URL argument" >&2
        return 2
    fi

    # POSIX sh has no `local`, so the helper variables carry a prefix.
    _fetch_url="$1"
    shift

    if have_curl; then
        _fetch_flag="-H"
    else
        _fetch_flag="--header"
    fi

    # Rewrite "$@" in place from (h1 h2 ...) to (flag h1 flag h2 ...).
    # The `for` word list is expanded once, before the first iteration, so
    # appending to and shifting "$@" inside the loop is safe.
    for _fetch_header in "$@"; do
        set -- "$@" "${_fetch_flag}" "${_fetch_header}"
        shift
    done
    set -- "$@" "${_fetch_url}"

    if [ "${_fetch_flag}" = "-H" ]; then
        curl -sSL "$@"
    else
        wget -qO- "$@"
    fi
}
 | |
| 
 | |
# https://docs.docker.com/docker-hub/api/latest/#tag/repositories
manifest_list_url="https://hub.docker.com/v2/repositories/${image}/tags/${ref}"

# Read image-detail lines (one JSON fragment per image) on stdin and print the
# digest of the first one whose architecture, os, and variant all equal the
# ARCH, OS, and VARIANT globals. Prints nothing when no line matches.
# Each line is passed on with `printf` and quoted, so it is never subject to
# word splitting, globbing, or echo's backslash handling.
select_platform_digest() {
    while read -r entry; do
        # Arch is probably most likely to be unique, so check that first.
        arch="$(printf '%s\n' "${entry}" | extract 'architecture')"
        if [ "${arch}" != "${ARCH}" ]; then continue; fi

        os="$(printf '%s\n' "${entry}" | extract 'os')"
        if [ "${os}" != "${OS}" ]; then continue; fi

        # A missing/null variant extracts as empty, which matches an
        # unspecified VARIANT.
        variant="$(printf '%s\n' "${entry}" | extract 'variant')"
        if [ "${variant}" = "${VARIANT}" ]; then
            printf '%s\n' "${entry}" | extract 'digest'
            break
        fi
    done
}

# If the ref is already a SHA-256 image digest, then we don't need to look up anything.
if [ -z "${ref##sha256:*}" ]; then
    digest="${ref}"
else
    echo "Getting multi-arch manifest list..."
    NL='
'
    digest=$(fetch "${manifest_list_url}" |
        # Break up the single-line JSON output into separate lines by adding
        # newlines before and after the chars '[', ']', '{', and '}'.
        # This uses the \${NL} syntax because some BSD variants of sed don't
        # support \n syntax in the replacement string, but instead require
        # a literal newline preceded by a backslash.
        sed -e 's/\([][{}]\)/\'"${NL}"'\1\'"${NL}"'/g' |
        # Extract the "images":[...] list.
        sed -n '/"images":/,/]/ p' |
        # Each image's details are now on a separate line, e.g.
        # "architecture":"arm64","features":"","variant":"v8","digest":"sha256:054c85801c4cb41511b176eb0bf13a2c4bbd41611ddd70594ec3315e88813524","os":"linux","os_features":"","os_version":null,"size":828724,"status":"active","last_pulled":"2022-09-02T22:46:48.240632Z","last_pushed":"2022-09-02T00:42:45.69226Z"
        # The image details are interspersed with lines of stray punctuation,
        # so grep for an arbitrary string that must be in these lines.
        grep architecture |
        # Search for an image that matches the platform.
        select_platform_digest)

    if [ -n "${digest}" ]; then
        echo "Platform ${PLATFORM} resolved to '${digest}'..."
    else
        echo "No image digest found. Verify that the image, ref, and platform are valid."
        exit 1
    fi
fi
 | |
| 
 | |
# Registry endpoints for this image.
#
# Token endpoint:
#   https://docs.docker.com/registry/spec/auth/token/#how-to-authenticate
# Manifest endpoint:
#   https://github.com/docker/distribution/blob/master/docs/spec/api.md#pulling-an-image-manifest
# Blob (layer) endpoint:
#   https://github.com/docker/distribution/blob/master/docs/spec/api.md#pulling-a-layer
registry_base="https://registry-1.docker.io/v2/${image}"
api_token_url="https://auth.docker.io/token?service=registry.docker.io&scope=repository:${image}:pull"
manifest_url="${registry_base}/manifests/${digest}"
blobs_base_url="${registry_base}/blobs"

# Manifest media types that are accepted:
#   https://github.com/distribution/distribution/blob/main/docs/spec/manifest-v2-2.md
#   https://github.com/opencontainers/image-spec/blob/main/manifest.md
docker_manifest_v2="application/vnd.docker.distribution.manifest.v2+json"
oci_manifest_v1="application/vnd.oci.image.manifest.v1+json"

# Docker Hub can return either type of manifest format. Most images seem to
# use the Docker format for now, but the OCI format will likely become more
# common as features that require that format become enabled by default
# (e.g., https://github.com/docker/build-push-action/releases/tag/v3.3.0).
accept_header="Accept: ${docker_manifest_v2},${oci_manifest_v1}"

# A pull-scoped bearer token is sent with the manifest and layer requests.
echo "Getting API token..."
token="$(fetch "${api_token_url}" | extract 'token')"
auth_header="Authorization: Bearer ${token}"
 | |
| 
 | |
echo "Getting image manifest for $image:$ref..."
layers=$(fetch "${manifest_url}" "${auth_header}" "${accept_header}" |
             # Keep `digest` values only from the `layers` section onwards.
             # sed works on whole lines, so when the manifest arrives as
             # compact single-line JSON the image config's digest sits on the
             # same line as "layers": and would be reported as a layer too.
             # Dropping everything before "layers": on the matching line
             # handles both the compact and the pretty-printed form.
             sed -n -e '/"layers":/,$ {' -e 's/^.*"layers"://' -e 'p' -e '}' |
             extract 'digest')

# An empty list means the request failed or the response was not an image
# manifest with a layers section.
if [ -z "${layers}" ]; then
    echo "No layers returned. Verify that the image and ref are valid."
    exit 1
fi
 | |
| 
 | |
mkdir -p "${OUT_DIR}"

# ${layers} is deliberately unquoted: it holds one digest per line and word
# splitting turns it into the list to iterate over. Layers are applied in the
# order the manifest listed them.
for layer in ${layers}; do
    hash="${layer#sha256:}"
    echo "Fetching and extracting layer ${hash}..."
    fetch "${blobs_base_url}/${layer}" "${auth_header}" |
        gzip -d |
        tar -C "${OUT_DIR}" -xf -

    # Whiteout files ('.wh.<name>') mark entries deleted by this layer.
    # Ref: https://github.com/moby/moby/blob/master/image/spec/v1.2.md#creating-an-image-filesystem-changeset
    #      https://github.com/moby/moby/blob/master/pkg/archive/whiteouts.go
    OLD_IFS="${IFS}"
    find "${OUT_DIR}" -name '.wh.*' |
        while IFS= read -r marker; do
            parent="${marker%/*}"
            marker_name="${marker##*/}"
            target_name="${marker_name#.wh.}"
            # Remove the whiteout marker itself as well as the entry it hides.
            rm -rf "${parent}/${marker_name}" "${parent}/${target_name}"
        done
    IFS="${OLD_IFS}"
done

echo "Image contents extracted into ${OUT_DIR}."