CI: Fix qemu-1-setup failure, remove debug stuff

- For whatever reason, the runner will now start up with either two 75GB disks or one 150GB disk. Previously the runner always booted with two 75GB disks, but about a quarter of the time it now starts up with a single 150GB disk. This caused qemu-1-setup.sh to fail since it expected the two 75GB disks. This commit updates qemu-1-setup.sh to work with either disk config. - Remove the watchdog from qemu-1-setup.sh. It didn't turn out to be useful. - Remove the timestamps that zfs-qemu.yml added to the qemu-1-setup.sh output. The timestamps were redundant, since you can already download timestamped logs from the GitHub web interface. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes #18166
2026-02-06 21:26:24 +01:00 · 2026-01-31 12:40:55 -08:00
parent b364720524
commit da9e8ff0df
2 changed files with 78 additions and 38 deletions
--- a/.github/workflows/scripts/qemu-1-setup.sh
+++ b/.github/workflows/scripts/qemu-1-setup.sh
@@ -6,13 +6,6 @@

 set -eu

-# We've been seeing this script take over 15min to run.  This may or
-# may not be normal.  Just to get a little more insight, print out
-# a message to stdout with the top running process, and do this every
-# 30 seconds.  We can delete this watchdog later once we get a better
-# handle on what the timeout value should be.
-(while [ 1 ] ; do sleep 30 && echo "[watchdog: $(ps -eo cmd --sort=-pcpu  | head -n 2 | tail -n 1)}')]"; done) &
-
 # The default 'azure.archive.ubuntu.com' mirrors can be really slow.
 # Prioritize the official Ubuntu mirrors.
 #
@@ -41,35 +34,89 @@ ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519 -q -N ""
 sudo systemctl stop docker.socket
 sudo systemctl stop multipathd.socket

-# remove default swapfile and /mnt
 sudo swapoff -a
-sudo umount -l /mnt
-DISK="/dev/disk/cloud/azure_resource-part1"
-sudo sed -e "s|^$DISK.*||g" -i /etc/fstab
-sudo wipefs -aq $DISK
-sudo systemctl daemon-reload
+
+# Special case:
+#
+# For reasons unknown, the runner can boot-up with two different block device
+# configurations.  On one config you get two 75GB block devices, and on the
+# other you get a single 150GB block device. Here's what both look like:
+#
+# --- One 150GB block device ---
+# NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINTS
+# sda       8:0    0  150G  0 disk
+# ├─sda1    8:1    0  149G  0 part /
+# ├─sda14   8:14   0    4M  0 part
+# ├─sda15   8:15   0  106M  0 part /boot/efi
+# └─sda16 259:0    0  913M  0 part /boot
+#
+# lrwxrwxrwx 1 root root  9 Jan 29 18:07 azure_root -> ../../sda
+# lrwxrwxrwx 1 root root 10 Jan 29 18:07 azure_root-part1 -> ../../sda1
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part14 -> ../../sda14
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part15 -> ../../sda15
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part16 -> ../../sda16
+#
+# --- Two 75GB block devices ---
+# NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINTS
+# sda       8:0    0   75G  0 disk
+# ├─sda1    8:1    0   74G  0 part /
+# ├─sda14   8:14   0    4M  0 part
+# ├─sda15   8:15   0  106M  0 part /boot/efi
+# └─sda16 259:0    0  913M  0 part /boot
+# sdb       8:16   0   75G  0 disk
+# └─sdb1    8:17   0   75G  0 part
+#
+# lrwxrwxrwx 1 root root  9 Jan 29 18:07 azure_resource -> ../../sdb
+# lrwxrwxrwx 1 root root 10 Jan 29 18:07 azure_resource-part1 -> ../../sdb1
+# lrwxrwxrwx 1 root root  9 Jan 29 18:07 azure_root -> ../../sda
+# lrwxrwxrwx 1 root root 10 Jan 29 18:07 azure_root-part1 -> ../../sda1
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part14 -> ../../sda14
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part15 -> ../../sda15
+#
+# If we have the azure_resource-part1 partition, umount it, partition it, and
+# use it as our ZFS disk and swap partition.  If not, just create a file VDEV
+# and swap file and use that instead.
+
+# remove default swapfile and /mnt
+if [ -e /dev/disk/cloud/azure_resource-part1 ] ; then
+  sudo umount -l /mnt
+  DISK="/dev/disk/cloud/azure_resource-part1"
+  sudo sed -e "s|^$DISK.*||g" -i /etc/fstab
+  sudo wipefs -aq $DISK
+  sudo systemctl daemon-reload
+fi

 sudo modprobe loop
 sudo modprobe zfs

-# partition the disk as needed
-DISK="/dev/disk/cloud/azure_resource"
-sudo sgdisk --zap-all $DISK
-sudo sgdisk -p \
- -n 1:0:+16G -c 1:"swap" \
- -n 2:0:0    -c 2:"tests" \
-$DISK
-sync
-sleep 1
+if [ -e /dev/disk/cloud/azure_resource-part1 ] ; then
+  echo "We have two 75GB block devices"
+  # partition the disk as needed
+  DISK="/dev/disk/cloud/azure_resource"
+  sudo sgdisk --zap-all $DISK
+  sudo sgdisk -p \
+   -n 1:0:+16G -c 1:"swap" \
+   -n 2:0:0    -c 2:"tests" \
+   $DISK
+  sync
+  sleep 1
+
+  sudo fallocate -l 12G /test.ssd2
+  DISKS="$DISK-part2 /test.ssd2"
+
+  SWAP=$DISK-part1
+else
+  echo "We have a single 150GB block device"
+  sudo fallocate -l 72G /test.ssd2
+  SWAP=/swapfile.ssd
+  sudo fallocate -l 16G $SWAP
+  sudo chmod 600 $SWAP
+  DISKS="/test.ssd2"
+fi

 # swap with same size as RAM (16GiB)
-sudo mkswap $DISK-part1
-sudo swapon $DISK-part1
-
-# JBOD 2xdisk for OpenZFS storage (test vm's)
-SSD1="$DISK-part2"
-sudo fallocate -l 12G /test.ssd2
-SSD2=$(sudo losetup -b 4096 -f /test.ssd2 --show)
+sudo mkswap $SWAP
+sudo swapon $SWAP

 # adjust zfs module parameter and create pool
 exec 1>/dev/null
@@ -78,7 +125,7 @@ ARC_MAX=$((1024*1024*512))
 echo $ARC_MIN | sudo tee /sys/module/zfs/parameters/zfs_arc_min
 echo $ARC_MAX | sudo tee /sys/module/zfs/parameters/zfs_arc_max
 echo 1 | sudo tee /sys/module/zfs/parameters/zvol_use_blk_mq
-sudo zpool create -f -o ashift=12 zpool $SSD1 $SSD2 -O relatime=off \
+sudo zpool create -f -o ashift=12 zpool $DISKS -O relatime=off \
  -O atime=off -O xattr=sa -O compression=lz4 -O sync=disabled \
  -O redundant_metadata=none -O mountpoint=/mnt/tests

@@ -86,6 +133,3 @@ sudo zpool create -f -o ashift=12 zpool $SSD1 $SSD2 -O relatime=off \
 for i in /sys/block/s*/queue/scheduler; do
  echo "none" | sudo tee $i
 done
-
-# Kill off our watchdog
-kill $(jobs -p)
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -96,11 +96,7 @@ jobs:

    - name: Setup QEMU
      timeout-minutes: 60
-      run: |
-        # Add a timestamp to each line to debug timeouts
-        while IFS=$'\n' read -r line; do
-            echo "$(date +'%H:%M:%S') $line"
-        done < <(.github/workflows/scripts/qemu-1-setup.sh)
+      run: .github/workflows/scripts/qemu-1-setup.sh

    - name: Start build machine
      timeout-minutes: 10