Rsyslog does not write to file - rsyslog

I have a very basic use case: make rsyslog listen on a given TCP port and write each line received to a specified text file. Rsyslog listens correctly on the port, and testing with logger + ngrep shows that everything is fine on the TCP side. However, rsyslog never writes anything to the specified file. I am a bit puzzled, as I never had this issue before.
My config:
# Load the imtcp input module so rsyslog can receive messages over TCP.
module(load="imtcp")
# Ruleset "rs1": referenced by the imtcp input below via ruleset="rs1".
ruleset(name="rs1") {
# I tested both syntaxes. None of them worked
#*.* /var/log/test.log
# Write messages handled by this ruleset to /var/log/test.log using omfile.
action(type="omfile" file="/var/log/test.log")
}
# Listen on TCP port 10514 and bind this input to ruleset "rs1".
input(type="imtcp" port="10514" ruleset="rs1")
The rest of the configuration is Debian's rsyslog configuration file.
sudo /usr/sbin/rsyslogd -f /etc/rsyslog.conf -N 1
rsyslogd: version 8.4.2, config validation run (level 1), master config /etc/rsyslog.conf
rsyslogd: End of config validation run. Bye.
Running /usr/sbin/rsyslogd -dn shows (as usual) a ton of output and says everything is OK. I triple-checked file permissions and did other basic checks; everything is OK.
Here is the debug output I get when testing
[..]
9533.048681189:main Q:Reg/w0 : strm 0x7f4e64003930: file -1(messages) flush, buflen 142
9533.048698110:main Q:Reg/w0 : strmPhysWrite, stream 0x7f4e64003930, len 142
9533.048720759:main Q:Reg/w0 : file '/var/log/messages' opened as #10 with mode 416
9533.048740602:main Q:Reg/w0 : strm 0x7f4e64003930: opened file '/var/log/messages' for WRITE as 10
9533.048762238:main Q:Reg/w0 : strm 0x7f4e64003930: file 10 write wrote 142 bytes
9533.048788387:main Q:Reg/w0 : Action 15 transitioned to state: rdy
9533.048794753:main Q:Reg/w0 : Action 15 transitioned to state: itx
9533.048810943:main Q:Reg/w0 : Action 15 transitioned to state: rdy
9533.048827085:main Q:Reg/w0 : actionCommit, in retry loop, iRet 0
9533.048842385:main Q:Reg/w0 : actionCommitAll: action 17, state 0, nbr to commit 0 isTransactional 0
9533.048848882:main Q:Reg/w0 : processBATCH: batch of 1 elements has been processed
9533.048865523:main Q:Reg/w0 : regular consumer finished, iret=0, szlog 0 sz phys 1
9533.048883876:main Q:Reg/w0 : DeleteProcessedBatch: we deleted 1 objects and enqueued 0 objects
9533.048900724:main Q:Reg/w0 : doDeleteBatch: delete batch from store, new sizes: log 0, phys 0
9533.048917314:main Q:Reg/w0 : regular consumer finished, iret=4, szlog 0 sz phys 0
9533.048923512:main Q:Reg/w0 : main Q:Reg/w0: worker IDLE, waiting for work.
9537.087044117:imtcp.c : epoll returned 1 entries
9537.087054376:imtcp.c : epoll push ppusr[0]: 0x180e070
9537.087059193:imtcp.c : tcpsrv: ready to process 1 event entries
9537.087062349:imtcp.c : tcpsrv: processing item 1, pUsr 0x180e070, bAbortConn
9537.087065363:imtcp.c : New connect on NSD 0x18219a0.
9537.087078854:imtcp.c : dnscache: entry (nil) found
9537.087174947:imtcp.c : adding nsdpoll entry 0/0x7f4e5c002af0, sock 11
9537.087182220:imtcp.c : New session created with NSD 0x7f4e5c002af0.
9537.087185460:imtcp.c : doing epoll_wait for max 128 events
9537.087612939:imtcp.c : epoll returned 1 entries
9537.087618865:imtcp.c : epoll push ppusr[0]: 0x7f4e5c002af0
9537.087621850:imtcp.c : tcpsrv: ready to process 1 event entries
9537.087624642:imtcp.c : tcpsrv: processing item 0, pUsr 0x7f4e5c002af0, bAbortConn
9537.087636869:imtcp.c : netstream 0x7f4e5c002a20 with new data
9537.087649100:imtcp.c : doing epoll_wait for max 128 events
9537.087705735:imtcp.c : epoll returned 1 entries
9537.087710379:imtcp.c : epoll push ppusr[0]: 0x7f4e5c002af0
9537.087713159:imtcp.c : tcpsrv: ready to process 1 event entries
9537.087715744:imtcp.c : tcpsrv: processing item 0, pUsr 0x7f4e5c002af0, bAbortConn
9537.087718426:imtcp.c : netstream 0x7f4e5c002a20 with new data
9537.087722700:imtcp.c : removing nsdpoll entry 0/0x7f4e5c002af0, sock 11
9537.087742477:imtcp.c : doing epoll_wait for max 128 events
Strace-ing the process shows that the only files rsyslog touches are /etc/resolv.conf and /etc/hosts, even though it did receive my log line:
iznogoud@haproxylogs-xen02:~$ sudo strace -p $(cat /var/run/rsyslogd.pid) -f
Process 7463 attached with 9 threads
[pid 7471] futex(0x7fead1c25004, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...>
[pid 7470] futex(0x7fead1c24f9c, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...>
[pid 7469] futex(0x7fead1c24f34, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...>
[pid 7468] futex(0x7fead1c24ecc, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...>
[pid 7467] futex(0x84967c, FUTEX_WAIT_PRIVATE, 11, NULL <unfinished ...>
[pid 7466] epoll_wait(8, <unfinished ...>
[pid 7465] read(4, <unfinished ...>
[pid 7464] select(4, [3], NULL, NULL, NULL <unfinished ...>
[pid 7463] select(1, NULL, NULL, NULL, {577, 636835}
<unfinished ...>
[pid 7466] <... epoll_wait resumed> {{EPOLLIN, {u32=3288344160, u64=140646287418976}}}, 128, -1) = 1
[pid 7466] accept(6, {sa_family=AF_INET6, sin6_port=htons(37578), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 13
[pid 7466] rt_sigprocmask(SIG_BLOCK, [HUP], ~[KILL STOP TTIN RTMIN RT_1], 8) = 0
[pid 7466] open("/etc/resolv.conf", O_RDONLY|O_CLOEXEC) = 14
[pid 7466] fstat(14, {st_mode=S_IFREG|0644, st_size=23, ...}) = 0
[pid 7466] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fead4506000
[pid 7466] read(14, "nameserver 10.75.164.1\n", 4096) = 23
[pid 7466] read(14, "", 4096) = 0
[pid 7466] close(14) = 0
[pid 7466] munmap(0x7fead4506000, 4096) = 0
[pid 7466] uname({sys="Linux", node="haproxylogs-xen02", ...}) = 0
[pid 7466] open("/etc/hosts", O_RDONLY|O_CLOEXEC) = 14
[pid 7466] fstat(14, {st_mode=S_IFREG|0644, st_size=201, ...}) = 0
[pid 7466] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fead4506000
[pid 7466] read(14, "127.0.0.1\tlocalhost\n10.75.164.12"..., 4096) = 201
[pid 7466] close(14) = 0
[pid 7466] munmap(0x7fead4506000, 4096) = 0
[pid 7466] rt_sigprocmask(SIG_SETMASK, ~[KILL STOP TTIN RTMIN RT_1], NULL, 8) = 0
[pid 7466] fcntl(13, F_GETFL) = 0x2 (flags O_RDWR)
[pid 7466] fcntl(13, F_SETFL, O_RDWR|O_NONBLOCK) = 0
[pid 7466] epoll_ctl(8, EPOLL_CTL_ADD, 13, {EPOLLIN, {u32=3288345072, u64=140646287419888}}) = 0
[pid 7466] epoll_wait(8, {{EPOLLIN, {u32=3288345072, u64=140646287419888}}}, 128, -1) = 1
# Rsyslog received my test logline as shown below (truncated)
[pid 7466] recvfrom(13, "<5>Jul 10 18:02:01 iznogoud: Mon"..., 131072, MSG_DONTWAIT, NULL, NULL) = 58
[pid 7466] gettimeofday({1499709721, 740339}, NULL) = 0
[pid 7466] epoll_wait(8, {{EPOLLIN, {u32=3288345072, u64=140646287419888}}}, 128, -1) = 1
[pid 7466] recvfrom(13, "", 131072, MSG_DONTWAIT, NULL, NULL) = 0
[pid 7466] epoll_ctl(8, EPOLL_CTL_DEL, 13, 7feac40029f0) = 0
[pid 7466] close(13) = 0
[pid 7466] epoll_wait(8, <unfinished ...>
[pid 7464] <... select resumed> ) = 1 (in [3])
Am I missing something obvious?
Thanks :)

Upgrading to rsyslog 8.23 fixed the problem:
rsyslogd 8.23.0, compiled with:
PLATFORM: x86_64-pc-linux-gnu
PLATFORM (lsb_release -d):
FEATURE_REGEXP: Yes
GSSAPI Kerberos 5 support: Yes
FEATURE_DEBUG (debug build, slow code): No
32bit Atomic operations supported: Yes
64bit Atomic operations supported: Yes
memory allocator: system default
Runtime Instrumentation (slow code): No
uuid support: Yes
Number of Bits in RainerScript integers: 64

Related

gdb how to break or step on forked processes of stripped binaries

I have a stripped binary which crashes and I want to reverse it. I tried the 'info file' to get the EntryPoint and set a breakpoint there. However a segmentation fault happens on one of the child processes...
[New process 40472]
process 40472 is executing new program: /usr/bin/dpkg
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[Inferior 2 (process 40472) exited normally]
E: Method http has died unexpectedly!
E: Sub-process http received a segmentation fault.
From the documentation I found 'show inferior', but I can't find out how to see the specifics of the segfault. I tried setting 'set follow-fork-mode' to child, but it doesn't look like it is helping.
For example I would like to examine the values of the registers such as RIP etc.
Stracing the process produces this:
[pid 54137] writev(3, [{"\0\37", 2}, {"{\346\1\0\0\1\0\0\0\0\0\0\4http\4example\3org\0\0\1\0\1", 31}, {"\0\37", 2}, {"\357\24\1\0\0\1\0\0\0\0\0\0\4http\4example\3org\0\0\34\0\1", 31}], 4) = 66
[pid 54137] read(3, <unfinished ...>
[pid 54134] <... read resumed> "\10\376", 2) = 2
[pid 54134] read(3, "X\250AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2302) = 2302
[pid 54134] close(3) = 0
[pid 54134] --- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=0} ---
[pid 54134] +++ killed by SIGSEGV +++
[pid 54131] <... select resumed> ) = 1 (in [5], left {0, 425835})
[pid 54131] --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_KILLED, si_pid=54134, si_uid=0, si_status=SIGSEGV, si_utime=0, si_stime=1} ---
close(4)
....
....
....
close(5) = 0
close(4) = 0
write(2, "E", 1E) = 1
write(2, ": ", 2: ) = 2
write(2, "Method http has died unexpectedl"..., 34Method http has died unexpectedly!) = 34
write(2, "\n", 1
) = 1
write(2, "E", 1E) = 1
write(2, ": ", 2: ) = 2
write(2, "Sub-process http received a segm"..., 47Sub-process http received a segmentation fault.) = 47
write(2, "\n", 1
) = 1
close(3) = 0
exit_group(100) = ?
+++ exited with 100 +++

redis operation with ruby is blocking in ppoll

Our project uses this APNs provider, running on CentOS 6.4, to push offline messages.
The APNs provider just reads from a Redis queue with brpop, then reformats the data and sends the APNs message to the Apple push service.
Recently I faced a problem where the APNs provider does NOT read messages from the Redis queue, so I ran strace on the process:
The abnormal strace result:
tcp 0 0 ::1:39688 ::1:6379 ESTABLISHED 29452/ruby
[root@server]# strace -p 29452
Process 29452 attached - interrupt to quit
ppoll([{fd=56, events=POLLIN}], 1, NULL, NULL, 8
The normal strace result:
clock_gettime(CLOCK_MONOTONIC, {9266059, 349937955}) = 0
select(9, [8], NULL, NULL, {6, 0}) = 1 (in [8], left {3, 976969})
fcntl64(8, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
read(8, "*-1\r\n", 1024) = 5
write(8, "*3\r\n$5\r\nbrpop\r\n$9\r\napn_queue\r\n$1"..., 37) = 37
fcntl64(8, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
read(8, 0x9a0e5d8, 1024) = -1 EAGAIN (Resource temporarily unavailable)
clock_gettime(CLOCK_MONOTONIC, {9266061, 374086306}) = 0
select(9, [8], NULL, NULL, {6, 0}^C <unfinished ...>
Process 20493 detached
here is the related code:
# Worker loop: pop a message from the Redis queue, build an APN notification
# from its JSON payload and send it. On any other error, log it, reconnect the
# client and push the current notification again.
loop do
  begin
    # The second argument (1) is presumably the brpop timeout in seconds -- confirm against the Redis client in use.
    message = @redis.brpop(self.queue, 1)
    if message
      APN.log(:info, "---------->#{message} ----------->\n")
      # The last element of the brpop result is parsed as the JSON payload.
      @notification = APN::Notification.new(JSON.parse(message.last,:symbolize_names => true))
      send_notification
    end
  rescue Exception => e
    # Interrupt / SystemExit are treated as a shutdown request: log and exit cleanly.
    if e.class == Interrupt || e.class == SystemExit
      APN.log(:info, 'Shutting down...')
      exit(0)
    end
    APN.log(:error, "class: #{e.class} Encountered error: #{e}, backtrace #{e.backtrace}")
    APN.log(:info, 'Trying to reconnect...')
    client.connect!
    APN.log(:info, 'Reconnected')
    # Retry the notification that was being processed when the error occurred.
    client.push(@notification)
  end
end
This problem occurs irregularly; the interval between occurrences may be one or two months.
I think the code logic is right, and I guess the system's network may be affecting the normal running of the program.
When I use pkill [pid] to kill the program, it recovers and starts reading messages from the queue again.
Now I don't know how to analyse the problem, so I have to use cron to reboot or send a kill signal to the program periodically every dawn. :(
Does anyone have an idea how to handle the problem?
In your abnormal strace result, ppoll is called with a NULL timeout, so it blocks until the descriptor becomes ready.
The correct way is:
/* Fragment: fd and retresult are assumed to be declared elsewhere. */
/* Explicit 10-second timeout (tv_sec = 10, tv_nsec = 0) instead of a NULL timeout. */
const struct timespec timeout = { .tv_sec = 10, .tv_nsec = 0 };
/* Single descriptor entry to poll. */
struct pollfd myfds;
myfds.fd = fd;
/* Wait for the descriptor to become readable. */
myfds.events = POLLIN;
/* Clear the returned-events field before the call. */
myfds.revents = 0;
/* Poll one entry, passing the timeout above rather than NULL. */
retresult = ppoll(&myfds, 1,&timeout,NULL);
This waits for at most 10 seconds; once the 10 seconds have elapsed, ppoll returns and execution continues with the next statement.

Go application hangs when starting subprocess with another user's credentials

I've been using a simple game server management application on Ubuntu 14.04 for the last 6 months or so. After a recent server update & reboot, the application hangs when trying to start a subprocess. After some debugging, it seems that whenever I try to start a subprocess with another user's credentials (I'm running as root), any command will hang.
Here's a simple application to demonstrate what causes the hang:
package main
import (
"os/exec"
"syscall"
"fmt"
)
// main runs "ls" as a child process with the credentials Uid 1022 / Gid 1023
// and prints the error returned by Run, if any.
func main() {
	cmd := exec.Command("ls")
	// Attach the target uid/gid to the child's process attributes.
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Credential: &syscall.Credential{Uid: 1022, Gid: 1023},
	}
	if runErr := cmd.Run(); runErr != nil {
		fmt.Printf("err: %v", runErr)
	}
}
By removing the syscall.Credential part, the application will run without any issues.
My question is: is there some platform/update specific reason that causes this behaviour? Is this no longer a correct way to run a subprocess as another user?
EDIT:
Here are the last few lines of strace -f:
[pid 3994] futex(0xc21000a888, FUTEX_WAKE, 1 <unfinished ...>
[pid 3995] <... futex resumed> ) = 0
[pid 3994] <... futex resumed> ) = 1
[pid 3995] futex(0xc21000a888, FUTEX_WAIT, 0, NULL <unfinished ...>
[pid 3994] select(0, NULL, NULL, NULL, {0, 20}) = 0 (Timeout)
[pid 3994] futex(0x7f615c51a4f8, FUTEX_WAIT, 0, NULL
So apparently if I'm interpreting this right it's blocking at futex_wait.
You should execute your application with strace. So strace myapp and see where it locks up. It could be you have something else that's forking before your application executes, which is causing it to hang.

RStudio Server: Cannot connect to service

I tried to set up RStudio Server on a newly installed Ubuntu 14.04 64bit machine. I followed the instructions but when I browsed the portal, I was told with a popup that I wasn't able to connect to the service.
From traceback:
13737 socket(PF_LOCAL, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 3
13737 connect(3, {sa_family=AF_LOCAL, sun_path="/dev/log"}, 110) = 0
13737 sendto(3, "<11>Oct 14 14:27:59 rsession-roo"..., 283, MSG_NOSIGNAL, NULL, 0) = 283
13737 exit_group(1) = ?
13737 +++ exited with 1 +++
13708 <... rt_sigtimedwait resumed> ) = 17
13708 wait4(13737, [{WIFEXITED(s) && WEXITSTATUS(s) == 1}], WNOHANG, NULL) = 13737
13708 rt_sigtimedwait([INT QUIT TERM CHLD], NULL, NULL, 8 <unfinished ...>
13729 connect(9, {sa_family=AF_LOCAL, sun_path="/tmp/rstudio-rsession/root"}, 28) = -1 ECONNREFUSED (Connection refused)
13728 connect(9, {sa_family=AF_LOCAL, sun_path="/tmp/rstudio-rsession/root"}, 28) = -1 ECONNREFUSED (Connection refused)
From /var/log/syslog:
Oct 14 14:26:42 iZ28xtxldicZ rsession-root[13730]: ERROR system error 13 (Permission denied); OCCURRED AT: int main(int, char* const*) /home/ubuntu/rstudio/src/cpp/session/SessionMain.cpp:3003; LOGGED FROM: int main(int, char* const*) /home/ubuntu/rstudio/src/cpp/session/SessionMain.cpp:3004
Oct 14 14:26:52 iZ28xtxldicZ rserver[13708]: ERROR system error 111 (Connection refused) [request-uri=/rpc/client_init]; OCCURRED AT: void rstudio::core::http::LocalStreamAsyncClient::handleConnect(const boost::system::error_code&) /home/ubuntu/rstudio/src/cpp/core/include/core/http/LocalStreamAsyncClient.hpp:84; LOGGED FROM: void rstudio::server::session_proxy::{anonymous}::logIfNotConnectionTerminated(const rstudio::core::Error&, const rstudio::core::http::Request&) /home/ubuntu/rstudio/src/cpp/server/ServerSessionProxy.cpp:269
Does anyone know what's happening here?
PS: At first it reported ENOENT for a path to the temp folder, and now it reports something different after I created that folder manually.

Groovy startup very slow

I have a problem when I start Groovy on one of my Linux machines - it takes about 30 seconds to execute a very simple command:
groovy -e ""
If I run strace on it, here is what I see at the point where it stops and waits:
mprotect(0x7fae284e0000, 4096, PROT_NONE) = 0
clone(child_stack=0x7fae285dfff0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7fae285e09d0, tls=0x7fae285e0700, child_tidptr=0x7fae285e09d0) = 62660
futex(0x7fae285e09d0, FUTEX_WAIT, 62660, NULL <unfinished ...>
Is there a way to figure out what it's waiting for and why and how to fix it?
I am running Red Hat 6.3, Groovy Version: 2.2.1 JVM: 1.7.0_25 Vendor: Oracle Corporation OS: Linux
And here is the output of the time command:
bin$ time groovy -e ""
real 0m22.255s
user 0m26.875s
sys 0m2.064s
EDITED:
As per the suggestion, I ran strace -f; here is what I see:
[pid 49451] <... gettimeofday resumed> {1397076179, 998954}, NULL) = 0
[pid 49482] clock_gettime(CLOCK_MONOTONIC, <unfinished ...>
[pid 49451] gettimeofday( <unfinished ...>
[pid 49482] <... clock_gettime resumed> {10719052, 15135866}) = 0
[pid 49451] <... gettimeofday resumed> {1397076180, 871}, NULL) = 0
[pid 49482] gettimeofday({1397076180, 2272}, NULL) = 0
[pid 49451] gettimeofday( <unfinished ...>
[pid 49482] futex(0x7fde3c145554, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 1, {1397076180, 52272000}, ffffffff <unfinished ...>
[pid 49451] <... gettimeofday resumed> {1397076180, 3226}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 5444}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 7123}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 8765}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 9766}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 10650}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 11611}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 12648}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 13569}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 14450}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 16851}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 17891}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 19012}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 20415}, NULL) = 0
[pid 49451] gettimeofday({1397076180, 21734}, NULL) = 0
It looks like it's waiting on gettimeofday; I see a lot of these calls in the trace.
And here is how it ends:
[pid 49475] gettimeofday({1397076182, 86016}, NULL) = 0
[pid 49475] futex(0x7fde3c008754, FUTEX_WAKE_OP_PRIVATE, 1, 1, 0x7fde3c008750, {FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 1}) = 1
[pid 49451] <... futex resumed> ) = 0
[pid 49475] madvise(0x7fddf09d6000, 1028096, MADV_DONTNEED <unfinished ...>
[pid 49451] futex(0x7fde3c008728, FUTEX_WAKE_PRIVATE, 1 <unfinished ...>
[pid 49475] <... madvise resumed> ) = 0
[pid 49451] <... futex resumed> ) = 0
[pid 49475] _exit(0) = ?
Process 49475 detached
[pid 49451] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid 49451] unlink("/tmp/mydirectory/49439") = 0
[pid 49451] madvise(0x7fde42dcc000, 1028096, MADV_DONTNEED) = 0
[pid 49451] _exit(0) = ?
Process 49451 detached
[pid 49439] <... futex resumed> ) = 0
[pid 49439] exit_group(0) = ?
The large number of gettimeofday calls might be related to this bug report.
You can try the suggested workaround and start Groovy with the -XX:-UsePerfData JVM option.
If that does not reduce the startup time, it will at least clean up the strace output and may help you get better information from it.

Resources