I have a Gearman worker in a shell script started with perp in the following way:
runuid -s gds \
/usr/bin/gearman -h 127.0.0.1 -t 1000 -w -f gds-rel \
-- xargs /home/gds/gds-rel-worker.sh < /dev/null 2>/dev/null
The worker only does some input validation and calls another shell script run.sh that invokes bash, curl, Terragrunt, Terraform, Ansible and gcloud to provision and update resources in GCP like this:
./run.sh --release 1.2.3 2>&1 >> /var/log/gds-release
The script is intended to run unattended. The problem I have is that after the job finishes successfully (that's both shell scripts run.sh and gds-rel-worker.sh) the Gearman job remains executing, because the child process becomes zombie (see last line below).
root 144748 1 0 Apr29 ? 00:00:00 perpboot -d /etc/perp
root 144749 144748 0 Apr29 ? 00:00:00 \_ tinylog -k 8 -s 100000 -t -z /var/log/perp/perpd-root
root 144750 144748 0 Apr29 ? 00:00:00 \_ perpd /etc/perp
root 2492482 144750 0 May14 ? 00:00:00 \_ tinylog (gearmand) -k 10 -s 100000000 -t -z /var/log/perp/gearmand
gearmand 2492483 144750 0 May14 ? 00:00:08 \_ /usr/sbin/gearmand -L 127.0.0.1 -p 4730 --verbose INFO --log-file stderr --keepalive --keepalive-idle 120 --keepalive-interval 120 --keepalive-count 3 --round-robin --threads 36 --worker-wakeup 3 --job-retries 1
root 2531800 144750 0 May14 ? 00:00:00 \_ tinylog (gds-rel-worker) -k 10 -s 100000000 -t -z /var/log/perp/gds-rel-worker
gds 2531801 144750 0 May14 ? 00:00:00 \_ /usr/bin/gearman -h 127.0.0.1 -t 1000 -w -f gds-rel -- xargs /home/gds/gds-rel-worker.sh
gds 2531880 2531801 0 May14 ? 00:00:00 \_ [xargs] <defunct>
So far I have traced the problem to run.sh, because if I replace its call with something simpler (e.g. echo "Hello"; sleep 5) the worker does not hang. Unfortunately, I have no clue what is causing the problem. The script run.sh is rather long and complex, but has been working without a problem so far. Tracing the worker process I see this:
getpid() = 2531801
write(2, "gearman: ", 9) = 9
write(2, "gearman_worker_work", 19) = 19
write(2, " : ", 3) = 3
write(2, "gearman_wait(GEARMAN_TIMEOUT) ti"..., 151) = 151
write(2, "\n", 1) = 1
sendto(5, "\0REQ\0\0\0'\0\0\0\0", 12, MSG_NOSIGNAL, NULL, 0) = 12
recvfrom(5, "\0RES\0\0\0\n\0\0\0\0", 8192, MSG_NOSIGNAL, NULL, NULL) = 12
sendto(5, "\0REQ\0\0\0\4\0\0\0\0", 12, MSG_NOSIGNAL, NULL, 0) = 12
poll([{fd=5, events=POLLIN}, {fd=3, events=POLLIN}], 2, 1000) = 1 ([{fd=5, revents=POLLIN}])
sendto(5, "\0REQ\0\0\0'\0\0\0\0", 12, MSG_NOSIGNAL, NULL, 0) = 12
recvfrom(5, "\0RES\0\0\0\6\0\0\0\0\0RES\0\0\0(\0\0\0QH:terra-"..., 8192, MSG_NOSIGNAL, NULL, NULL) = 105
pipe([6, 7]) = 0
pipe([8, 9]) = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fea38480a50) = 2531880
close(6) = 0
close(9) = 0
write(7, "1.2.3\n", 18) = 6
close(7) = 0
read(8, "which: no terraform-0.14 in (/us"..., 1024) = 80
read(8, "Identity added: /home/gds/.ssh/i"..., 1024) = 54
read(8, 0x7fff6251f5b0, 1024) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=2531880, si_uid=1006, si_status=0, si_utime=0, si_stime=0} ---
read(8,
So the worker continues reading standard output even though the child has finished successfully and presumably closed it. Any ideas how to catch what causes this problem?
I was able to solve it. The script run.sh was starting ssh-agent, which opens a socket and since Gearman redirects all outputs the worker continued reading the open file descriptor even after the script successfully completed.
I found it by examining the open file descriptors for the Gearman worker process after it hang:
# ls -l /proc/2531801/fd/*
lr-x------. 1 gds devops 64 May 17 11:26 /proc/2531801/fd/0 -> /dev/null
l-wx------. 1 gds devops 64 May 17 11:26 /proc/2531801/fd/1 -> 'pipe:[9356665]'
l-wx------. 1 gds devops 64 May 17 11:26 /proc/2531801/fd/2 -> 'pipe:[9356665]'
lr-x------. 1 gds devops 64 May 17 11:26 /proc/2531801/fd/3 -> 'pipe:[9357481]'
l-wx------. 1 gds devops 64 May 17 11:26 /proc/2531801/fd/4 -> 'pipe:[9357481]'
lrwx------. 1 gds devops 64 May 17 11:26 /proc/2531801/fd/5 -> 'socket:[9357482]'
lr-x------. 1 gds devops 64 May 17 11:26 /proc/2531801/fd/8 -> 'pipe:[9369888]'
Then identified the processes using file node for the pipe in file descriptor 8 that German worker continued reading:
# lsof | grep 9369888
gearman 2531801 gds 8r FIFO 0,13 0t0 9369888 pipe
ssh-agent 2531899 gds 9w FIFO 0,13 0t0 9369888 pipe
And finally listed files opened by ssh-agent and found what stands behind file descriptor 3:
# ls -l /proc/2531899/fd/*
lrwx------. 1 root root 64 May 17 11:14 /proc/2531899/fd/0 -> /dev/null
lrwx------. 1 root root 64 May 17 11:14 /proc/2531899/fd/1 -> /dev/null
lrwx------. 1 root root 64 May 17 11:14 /proc/2531899/fd/2 -> /dev/null
lrwx------. 1 root root 64 May 17 11:14 /proc/2531899/fd/3 -> 'socket:[9346577]'
# lsof | grep 9346577
ssh-agent 2531899 gds 3u unix 0xffff89016fd34000 0t0 9346577 /tmp/ssh-0b14coFWhy40/agent.2531898 type=STREAM
As a solution I added kill of the ssh-agent before exit from run.sh script and now there are no jobs hanging due to zombie process.
Here is the 'smem' command I run on the Redhat/CentOS Linux system. I expect the output be printed without the fields with zero size however I would expect the heading columns.
smem -kt -c "pid user command swap"
PID User Command Swap
7894 root /sbin/agetty --noclear tty1 0
9666 root ./nimbus /opt/nimsoft 0
7850 root /sbin/auditd 236.0K
7885 root /usr/sbin/irqbalance --fore 0
11205 root nimbus(hdb) 0
10701 root nimbus(spooler) 0
8446 trapsanalyzer1 /opt/traps/analyzerd/analyz 0
50316 apache /usr/sbin/httpd -DFOREGROUN 0
50310 apache /usr/sbin/httpd -DFOREGROUN 0
3971 root /usr/sbin/lvmetad -f 36.0K
63988 root su - 0
7905 ntp /usr/sbin/ntpd -u ntp:ntp - 4.0K
7876 dbus /usr/bin/dbus-daemon --syst 44.0K
9672 root nimbus(controller) 0
7888 root /usr/lib/systemd/systemd-lo 0
63990 root -bash 0
59978 postfix pickup -l -t unix -u 0
3977 root /usr/lib/systemd/systemd-ud 736.0K
9016 postfix qmgr -l -t unix -u 0
50303 root /usr/sbin/httpd -DFOREGROUN 0
3941 root /usr/lib/systemd/systemd-jo 52.0K
8199 root //usr/lib/vmware-caf/pme/bi 0
8598 daemon /opt/quest/sbin/.vasd -p /v 0
8131 root /usr/sbin/vmtoolsd 0
7881 root /usr/sbin/NetworkManager -- 8.0K
8364 root /opt/puppetlabs/puppet/bin/ 0
8616 daemon /opt/quest/sbin/.vasd -p /v 0
23290 root /usr/sbin/rsyslogd -n 3.8M
64091 root python /bin/smem -kt -c pid 0
7887 polkitd /usr/lib/polkit-1/polkitd - 0
8363 root /usr/bin/python2 -Es /usr/s 0
53606 root /usr/share/metricbeat/bin/m 0
24631 nagios /usr/local/ncpa/ncpa_passiv 0
24582 nagios /usr/local/ncpa/ncpa_listen 0
7886 root /opt/traps/bin/authorized 76.0K
7872 root /opt/traps/bin/pmd 12.0K
8374 root /opt/puppetlabs/puppet/bin/ 0
7883 root /opt/traps/bin/trapsd 64.0K
----------------------------------------------------
54 10 5.1M
Like this?:
$ awk '$NF!=0' file
PID User Command Swap
7850 root /sbin/auditd 236.0K
...
7883 root /opt/traps/bin/trapsd 64.0K
----------------------------------------------------
54 10 5.1M
But instead of using the form awk ... file you'd probably like to smem ... | awk '$NF!=0'.
Could you please try following, for extra precautions removing the space from last fields(in case it is there).
smem -kt -c "pid user command swap" | awk 'FNR==1{print;next} {sub(/[[:space:]]+$/,"")} $NF==0{next} 1'
I am new to Teradata.I am having a problem with my BTEQ program. The BTEQ job passes a variable shell script. The purpose of the program to check for any null values in the summary table that is 2 months delay. Here is an example of the table (the purpose of the program: when The Summary table is missing two months, for example, Oct and Nov then flag=Y. flag=Y means YES that there are null values in the Summary table for the month of Oct and Nov. flag=N means there are no null values):
dt_load maxmth summary
12-SEP Sep Sep
13 SEP Sep
29-SEP Sep
2-Oct Oct
3-Oct Oct
1-Nov Nov
2-Nov Nov
user="usr1"
pass="pass1"
SRC_DB="emp"
SOURCE_TABLE="summarytb"
FLAG =$(query_td)
td_query () { bteq << EOF |grep '^>' |sed -e "s/^>//"
${HOST}/${USER},${PASSWORD}
DATABASE $SRC_DB;
.set width 1000;
.set titledashes off;
SELECT COUNT(*)
FROM ${SRC_DB}.${SOURCE_TABLE};
( SELECT
dt_load,
summary ,
maxmth,
/*check for null 2 months back*/
MDIFF(summary, 2, maxmth, dt_load) AS diff1
FROM $SOURCE_TABLE
Group by dt_load) AS a
FROM
/*if summary field is null than insert 2*/
( SELECT *,
COALESCE(a.diff1,2) AS diff (Select FROM a) AS b
FROM
/* if diff= 2 than flag =Y else flag=N*/
( SELECT *,
CASE
When b.diff = 2 then āY ā ELSE āNā
End ) AS flag (SELECT c.diff FROM b) c
FROM
/*pass flag variable to shell script*/
SELECT '>'||' c.flag';
$1
.LOGOFF;
.QUIT;
.EXIT
EOF
}
echo $FLAG
if [ $FLAG='Y' ] then echo Run file1;
exit 1
fi
IF [ $FLAG='N' ] then echo run file 2;
exit 1
fi
Given this C program:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int main(void) {
pid_t pid;
if ((pid = fork()) >= 0) {
if (pid == 0) {
exit(0);
} else {
sleep(999);
}
}
return 0;
}
after running:
$ gcc -o zombie zombie.c
$ ./zombie &
$ ps -f
only the parent process is showing:
501 77474 74651 0 2:36PM ttys000 0:00.00 ./zombie
On Linux the same program shows this:
osboxes 6900 6835 0 13:37 pts/1 00:00:00 ./zombie
osboxes 6903 6900 0 13:37 pts/1 00:00:00 [zombie] <defunct>
Now, strangely, on OSX when starting more than one zombie programs..
$ ./zombie &
$ ./zombie &
$ ps -f
.. the zombie processes (in brackets) show up:
501 77653 74651 0 2:39PM ttys000 0:00.00 ./zombie
501 77664 74651 0 2:39PM ttys000 0:00.00 ./zombie
501 77667 77664 0 2:39PM ttys000 0:00.00 (zombie)
What's the reason for this?
I used homebrew to install GNU parallel on my mac so I can run some tests remotely on my University's servers. I was quickly running through the tutorials, but when I ran
parallel -S <username>#$SERVER1 echo running on ::: <username>#$SERVER1
I got the message
parallel: Warning: Could not figure out number of cpus on <username#server> (). Using 1.
Possibly related, I never added parallel to my path and got the warning that "parallel" wasn't a recognized command, but parallel ran anyways and still echo'd correctly. This particular server has 16 cores, how can I get parallel to recognize them?
GNU Parallel is less tested on OS X as I do not have access to an OS X installation, so you have likely found a bug.
GNU Parallel has since 20120322 used these to find the number of CPUs:
sysctl -n hw.physicalcpu
sysctl -a hw 2>/dev/null | grep [^a-z]physicalcpu[^a-z] | awk '{ print \$2 }'
And the number of cores:
sysctl -n hw.logicalcpu
sysctl -a hw 2>/dev/null | grep [^a-z]logicalcpu[^a-z] | awk '{ print \$2 }'
Can you test what output you get from those?
Which version of GNU Parallel are you using?
As a work around you can force GNU Parallel to detect 16 cores:
parallel -S 16/<username>#$SERVER1 echo running on ::: <username>#$SERVER1
Since version 20140422 you have been able to export your path to the remote server:
parallel --env PATH -S 16/<username>#$SERVER1 echo running on ::: <username>#$SERVER1
That way you just need to add the dir where parallel lives on the server to your path on local machine. E.g. parallel on the remote server is in /home/u/user/bin/parallel:
PATH=$PATH:/home/u/user/bin parallel --env PATH -S <username>#$SERVER1 echo running on ::: <username>#$SERVER1
Information for Ole
My iMac (OSX MAvericks on Intel core i7) gives the following, which all looks correct:
sysctl -n hw.physicalcpu
4
sysctl -a hw
hw.ncpu: 8
hw.byteorder: 1234
hw.memsize: 17179869184
hw.activecpu: 8
hw.physicalcpu: 4
hw.physicalcpu_max: 4
hw.logicalcpu: 8
hw.logicalcpu_max: 8
hw.cputype: 7
hw.cpusubtype: 4
hw.cpu64bit_capable: 1
hw.cpufamily: 1418770316
hw.cacheconfig: 8 2 2 8 0 0 0 0 0 0
hw.cachesize: 17179869184 32768 262144 8388608 0 0 0 0 0 0
hw.pagesize: 4096
hw.busfrequency: 100000000
hw.busfrequency_min: 100000000
hw.busfrequency_max: 100000000
hw.cpufrequency: 3400000000
hw.cpufrequency_min: 3400000000
hw.cpufrequency_max: 3400000000
hw.cachelinesize: 64
hw.l1icachesize: 32768
hw.l1dcachesize: 32768
hw.l2cachesize: 262144
hw.l3cachesize: 8388608
hw.tbfrequency: 1000000000
hw.packages: 1
hw.optional.floatingpoint: 1
hw.optional.mmx: 1
hw.optional.sse: 1
hw.optional.sse2: 1
hw.optional.sse3: 1
hw.optional.supplementalsse3: 1
hw.optional.sse4_1: 1
hw.optional.sse4_2: 1
hw.optional.x86_64: 1
hw.optional.aes: 1
hw.optional.avx1_0: 1
hw.optional.rdrand: 0
hw.optional.f16c: 0
hw.optional.enfstrg: 0
hw.optional.fma: 0
hw.optional.avx2_0: 0
hw.optional.bmi1: 0
hw.optional.bmi2: 0
hw.optional.rtm: 0
hw.optional.hle: 0
hw.cputhreadtype: 1
hw.machine = x86_64
hw.model = iMac12,2
hw.ncpu = 8
hw.byteorder = 1234
hw.physmem = 2147483648
hw.usermem = 521064448
hw.pagesize = 4096
hw.epoch = 0
hw.vectorunit = 1
hw.busfrequency = 100000000
hw.cpufrequency = 3400000000
hw.cachelinesize = 64
hw.l1icachesize = 32768
hw.l1dcachesize = 32768
hw.l2settings = 1
hw.l2cachesize = 262144
hw.l3settings = 1
hw.l3cachesize = 8388608
hw.tbfrequency = 1000000000
hw.memsize = 17179869184
hw.availcpu = 8
sysctl -n hw.logicalcpu
8