kvm-remote.sh 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. #!/bin/bash
  2. # SPDX-License-Identifier: GPL-2.0+
  3. #
  4. # Run a series of tests on remote systems under KVM.
  5. #
  6. # Usage: kvm-remote.sh "systems" [ <kvm.sh args> ]
  7. # kvm-remote.sh "systems" /path/to/old/run [ <kvm-again.sh args> ]
  8. #
  9. # Copyright (C) 2021 Facebook, Inc.
  10. #
  11. # Authors: Paul E. McKenney <paulmck@kernel.org>
  # Remember how the script was invoked; both values are echoed later on.
  scriptname=$0
  args="$*"

  # The relative paths used below only make sense from the top of a
  # kernel source tree, so refuse to run from anywhere else.
  if [ ! -d tools/testing/selftests/rcutorture/bin ]; then
    echo $scriptname must be run from top-level directory of kernel source tree.
    exit 1
  fi

  # Export the rcutorture location and put its bin directory first on PATH
  # before sourcing functions.sh.
  RCUTORTURE="$(pwd)/tools/testing/selftests/rcutorture"
  PATH=${RCUTORTURE}/bin:$PATH
  export RCUTORTURE PATH
  . functions.sh
  starttime="$(get_starttime)"

  # The first argument is the list of remote systems; it is shifted off so
  # that the remaining arguments can be passed along unchanged.
  systems="$1"
  if [ -z "$systems" ]; then
    echo $scriptname: Empty list of systems will go nowhere good, giving up.
    exit 1
  fi
  shift
  # Pathnames:
  # T: /tmp/kvm-remote.sh.NNNNNN where "NNNNNN" is set by mktemp
  # resdir: /tmp/kvm-remote.sh.NNNNNN/res
  # rundir: /tmp/kvm-remote.sh.NNNNNN/res/$ds ("-remote" suffix)
  # oldrun: `pwd`/tools/testing/.../res/$otherds
  #
  # Pathname segments:
  # TD: kvm-remote.sh.NNNNNN
  # ds: yyyy.mm.dd-hh.mm.ss-remote
  #
  # NOTE(review): T honors $TMPDIR, but the tarball is later unpacked under
  # /tmp on the remote systems, so a TMPDIR other than /tmp would presumably
  # make these pathnames differ on the remote side -- confirm.
  T="$(mktemp -d "${TMPDIR-/tmp}/kvm-remote.sh.XXXXXX")"
  # Without a usable $T every pathname derived below would be rooted at "/",
  # so give up right away if mktemp failed.
  if test -z "$T" || ! test -d "$T"
  then
    echo $scriptname: Unable to create temporary directory, giving up.
    exit 1
  fi
  # $T is quoted inside the trap so that cleanup removes exactly that
  # directory even when $TMPDIR contains whitespace or glob characters.
  trap 'rm -rf "$T"' 0
  TD="$(basename "$T")"
  resdir="$T/res"
  ds=$(date +%Y.%m.%d-%H.%M.%S)-remote
  rundir=$resdir/$ds
  echo Results directory: $rundir
  echo $scriptname $args
  # Choose between a fresh build and re-use of an old run.  A first
  # remaining argument starting with "--" is taken to be a kvm.sh option
  # (fresh build); anything else is taken to be the pathname of an old run.
  # Either way, kvm-again.sh is run with --dryrun to populate $rundir, and
  # $oldrun ends up naming the local directory holding remote-log.
  if printf '%s\n' "$1" | grep -q '^--'
  then
    # Fresh build. Create a datestamp unless the caller supplied one.
    datestamp="$(echo "$@" | awk -v ds="$ds" '{
      for (i = 1; i < NF; i++) {
        if ($i == "--datestamp") {
          ds = "";
          break;
        }
      }
      if (ds != "")
        print "--datestamp " ds;
    }')"
    # $datestamp is deliberately unquoted: it is either empty or the two
    # words "--datestamp <ds>".
    kvm.sh --remote "$@" $datestamp --buildonly > "$T/kvm.sh.out" 2>&1
    ret=$?
    if test "$ret" -ne 0
    then
      # Report $ret rather than $?, which by now holds the status of the
      # "test" command above.
      echo $scriptname: kvm.sh failed exit code $ret
      cat "$T/kvm.sh.out"
      exit 2
    fi
    # kvm.sh reports where it put the build; that directory becomes $oldrun.
    oldrun="$(grep -m 1 "^Results directory: " "$T/kvm.sh.out" | awk '{ print $3 }')"
    touch "$oldrun/remote-log"
    echo $scriptname $args >> "$oldrun/remote-log"
    echo | tee -a "$oldrun/remote-log"
    echo " ----" kvm.sh output: "($(date))" | tee -a "$oldrun/remote-log"
    tee -a "$oldrun/remote-log" < "$T/kvm.sh.out"
    # We are going to run this, so remove the buildonly files.
    rm -f "$oldrun"/*/buildonly
    kvm-again.sh "$oldrun" --dryrun --remote --rundir "$rundir" > "$T/kvm-again.sh.out" 2>&1
    ret=$?
    if test "$ret" -ne 0
    then
      echo $scriptname: kvm-again.sh failed exit code $ret | tee -a "$oldrun/remote-log"
      tee -a "$oldrun/remote-log" < "$T/kvm-again.sh.out"
      exit 2
    fi
  else
    # Re-use old run.
    oldrun="$1"
    # Turn a relative pathname into an absolute one.
    if ! printf '%s\n' "$oldrun" | grep -q '^/'
    then
      oldrun="$(pwd)/$oldrun"
    fi
    shift
    touch "$oldrun/remote-log"
    echo $scriptname $args >> "$oldrun/remote-log"
    kvm-again.sh "$oldrun" "$@" --dryrun --remote --rundir "$rundir" > "$T/kvm-again.sh.out" 2>&1
    ret=$?
    if test "$ret" -ne 0
    then
      echo $scriptname: kvm-again.sh failed exit code $ret | tee -a "$oldrun/remote-log"
      tee -a "$oldrun/remote-log" < "$T/kvm-again.sh.out"
      exit 2
    fi
    # From here on, log into a copy of the new run directory kept under
    # the local res directory instead of into the old run.
    cp -a "$rundir" "$RCUTORTURE/res/"
    oldrun="$RCUTORTURE/res/$ds"
  fi
  # Show the kvm-again.sh output and record it in remote-log as well, so
  # that the header logged just before it is followed by the output it
  # announces.
  echo | tee -a "$oldrun/remote-log"
  echo " ----" kvm-again.sh output: "($(date))" | tee -a "$oldrun/remote-log"
  tee -a "$oldrun/remote-log" < "$T/kvm-again.sh.out"
  echo | tee -a "$oldrun/remote-log"
  echo Remote run directory: $rundir | tee -a "$oldrun/remote-log"
  echo Local build-side run directory: $oldrun | tee -a "$oldrun/remote-log"
  # Create the kvm-remote-N.sh scripts in the bin directory.
  # Each line of the scenarios file yields one script, named after the
  # line's first field with its first "." removed.  The script starts
  # kvm-remote-noreap.sh in the background, runs kvm-test-1-run-batch.sh on
  # the remaining fields, syncs, and finally removes $rundir/remote.run.
  awk < "$rundir"/scenarios -v dest="$T/bin" -v rundir="$rundir" '
  {
    n = $1;
    sub(/\./, "", n);
    fn = dest "/kvm-remote-" n ".sh"
    print "kvm-remote-noreap.sh " rundir " &" > fn;
    scenarios = "";
    for (i = 2; i <= NF; i++)
      scenarios = scenarios " " $i;
    print "kvm-test-1-run-batch.sh" scenarios >> fn;
    print "sync" >> fn;
    print "rm " rundir "/remote.run" >> fn;
  }'
  chmod +x "$T"/bin/kvm-remote-*.sh
  # Archive bin and res relative to the parent of $T so that the member
  # names start with $TD.  Only run tar if the cd succeeded; otherwise it
  # would archive from whatever the current directory happens to be.
  ( cd "$(dirname "$T")" && tar -chzf "$T/binres.tgz" "$TD/bin" "$TD/res" )
  # Probe every system up front, so that a typo in a system name is caught
  # before anything has been copied anywhere that would need cleaning up.
  for i in $systems; do
    ssh -o BatchMode=yes $i getconf _NPROCESSORS_ONLN > $T/ssh.stdout 2> $T/ssh.stderr
    ret=$?
    if [ "$ret" -ne 0 ]; then
      # Report the failure along with whatever ssh printed, then give up.
      {
        echo "System $i unreachable ($ret), giving up."
        echo ' --- ssh stdout: vvv'
        cat $T/ssh.stdout
        echo ' --- ssh stdout: ^^^'
        echo ' --- ssh stderr: vvv'
        cat $T/ssh.stderr
        echo ' --- ssh stderr: ^^^'
      } | tee -a "$oldrun/remote-log"
      exit 4
    fi
    # Reachable: log the CPU count that the system reported.
    echo $i: $(cat $T/ssh.stdout) CPUs " " $(date) | tee -a "$oldrun/remote-log"
  done
  # Copy the build-products tarball to the specified system and unpack it
  # under /tmp there.  A failed transfer is retried after a 60-second wait.
  # If a retry fails after more than five earlier retries have been made,
  # the script exits with status 10.  Messages go to stdout and are
  # appended to "$oldrun/remote-log".
  #
  # Usage: downloadtarball system
  downloadtarball () {
    local ret
    local tries=0

    ssh -o BatchMode=yes "$1" "cd /tmp; tar -xzf -" < "$T/binres.tgz"
    ret=$?
    while test "$ret" -ne 0
    do
      echo "Unable to download $T/binres.tgz to system $1, waiting and then retrying. $tries prior retries." | tee -a "$oldrun/remote-log"
      sleep 60
      ssh -o BatchMode=yes "$1" "cd /tmp; tar -xzf -" < "$T/binres.tgz"
      ret=$?
      # Use -gt for the comparison: inside "test", a bare ">" is a
      # redirection to a file, which makes the test always succeed.
      if test "$ret" -ne 0 && test "$tries" -gt 5
      then
        echo "Unable to download $T/binres.tgz to system $1, giving up." | tee -a "$oldrun/remote-log"
        exit 10
      fi
      tries=$((tries+1))
    done
  }

  # Download and expand the tarball on all systems.
  echo Build-products tarball: $(du -h "$T/binres.tgz") | tee -a "$oldrun/remote-log"
  for i in $systems
  do
    echo Downloading tarball to $i $(date) | tee -a "$oldrun/remote-log"
    downloadtarball "$i"
  done
  # Function to check for presence of a file on the specified system.
  # Complain if the system cannot be reached, and retry after a wait.
  # Currently just waits 15 minutes if a machine disappears.
  #
  # Usage: checkremotefile system pathname
  #
  # Returns 0 if the file exists, 1 if it does not, 255 once ssh has
  # exited with status 255 more than 15 times in a row, and any other ssh
  # exit status unchanged.  Messages go to stdout and are appended to
  # "$oldrun/remote-log".
  checkremotefile () {
    local nsshfails=0
    local ret
    local sleeptime=60

    while :
    do
      # The system name is quoted so that it always reaches ssh as a
      # single word, free of word splitting and pathname expansion.
      ssh -o BatchMode=yes "$1" "test -f \"$2\""
      ret=$?
      if test "$ret" -eq 255
      then
        # Status 255 is treated as a failure of ssh itself: wait and
        # try again, up to the limit below.
        echo " --- ssh failure to $1 checking for file $2, retry after $sleeptime seconds." $(date) | tee -a "$oldrun/remote-log"
        nsshfails=$((nsshfails+1))
        if ((nsshfails > 15))
        then
          return 255
        fi
      elif test "$ret" -eq 0
      then
        return 0
      elif test "$ret" -eq 1
      then
        echo " --- File \"$2\" not found: ssh $1 test -f \"$2\"" | tee -a "$oldrun/remote-log"
        return 1
      else
        # Any other status is handed straight back; whether to retry is
        # left to the caller.
        echo " --- Exit code $ret: ssh $1 test -f \"$2\", retry after $sleeptime seconds." $(date) | tee -a "$oldrun/remote-log"
        return $ret
      fi
      sleep $sleeptime
    done
  }
  # Function to start batches on idle remote $systems
  #
  # Usage: startbatches curbatch nbatches
  #
  # Batches are numbered starting at 1. Returns the next batch to start.
  # Be careful to redirect all debug output to FD 2 (stderr).
  #
  # The "return value" is printed on stdout.  A system is considered busy
  # while $resdir/$ds/remote.run exists on it.  If ssh fails to launch a
  # batch, the script exits with status 11.
  startbatches () {
    local curbatch="$1"
    local nbatches="$2"
    local ret
    local i

    # Each pass through the following loop examines one system.
    for i in $systems
    do
      if test "$curbatch" -gt "$nbatches"
      then
        echo $((nbatches + 1))
        return 0
      fi
      if checkremotefile "$i" "$resdir/$ds/remote.run" 1>&2
      then
        continue # System still running last test, skip.
      fi
      # Mark the system busy and start the batch script in the
      # background on the remote side.
      ssh -o BatchMode=yes "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2
      ret=$?
      if test "$ret" -ne 0
      then
        # Record the failure in remote-log directly.  The exit below
        # triggers the cleanup trap, so a caller that captured stderr
        # in a file under $T would never get to display it.
        echo ssh $i failed: exitcode $ret | tee -a "$oldrun/remote-log" 1>&2
        exit 11
      fi
      echo " ----" System $i Batch $(head -n $curbatch < "$rundir"/scenarios | tail -1) $(date) 1>&2
      curbatch=$((curbatch + 1))
    done
    echo $curbatch
  }
  # Launch all the scenarios.
  # There is one batch per line of the scenarios file.  Each pass hands
  # out as many batches as there are idle systems, then waits before
  # polling again for as long as batches remain unstarted.
  nbatches="$(wc -l "$rundir"/scenarios | awk '{ print $1 }')"
  curbatch=1
  while [ "$curbatch" -le "$nbatches" ]; do
    # stdout carries the number of the next batch to start; stderr carries
    # the progress messages, which are replayed into the log below.
    startbatches $curbatch $nbatches > $T/curbatch 2> $T/startbatches.stderr
    curbatch="$(cat $T/curbatch)"
    if [ -s "$T/startbatches.stderr" ]; then
      cat "$T/startbatches.stderr" | tee -a "$oldrun/remote-log"
    fi
    if [ "$curbatch" -le "$nbatches" ]; then
      sleep 30
    fi
  done
  echo All batches started. $(date) | tee -a "$oldrun/remote-log"
  # Wait for all remaining scenarios to complete and collect results.
  # A system is done once its remote.run file is gone (checkremotefile
  # returns 1).  Its results are then pulled into $oldrun and the remote
  # copy of $T is removed.  A return of 255 abandons that system's results;
  # anything else means "poll again in 30 seconds".
  for i in $systems; do
    echo " ---" Waiting for $i $(date) | tee -a "$oldrun/remote-log"
    while :; do
      checkremotefile "$i" "$resdir/$ds/remote.run"
      ret=$?
      case "$ret" in
      1)
        echo " ---" Collecting results from $i $(date) | tee -a "$oldrun/remote-log"
        ( cd "$oldrun"; ssh -o BatchMode=yes $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - )
        break
        ;;
      255)
        echo System $i persistent ssh failure, lost results $(date) | tee -a "$oldrun/remote-log"
        break
        ;;
      esac
      sleep 30
    done
  done
  # The statistics script runs on the left-hand side of a pipeline, so its
  # exit status is passed through a file and becomes this script's status.
  ( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log"
  exit "$(cat $T/exitcode)"