commit 2c8fb90b1e0dde9cdb762321686cdff591863d11 from: Omar Polo via: Thomas Adam date: Sun Jun 25 17:34:21 2023 UTC gotd: wait asynchronously for children termination Instead of the current kill() + waitpid(WNOHANG), manage the subprocesses in a separate queue and handle SIGCHLD. A timer is installed to ensure that misbehaving subprocesses are still killed. Fixes the current "child PID 0 terminated" logs due to races with waitpid(). Issue initially reported by Josiah Frentsos. ok + tweaks stsp@ commit - 94d1a66aaad059d61f5a244f4ffbe0607ae52d59 commit + 2c8fb90b1e0dde9cdb762321686cdff591863d11 blob - 4b56d0042209d042290978d8010c2f766591ae48 blob + f096baf28be2970ac75987af00fbdf3834130ea1 --- gotd/gotd.c +++ gotd/gotd.c @@ -77,7 +77,11 @@ struct gotd_child_proc { char repo_path[PATH_MAX]; int pipe[2]; struct gotd_imsgev iev; + struct event tmo; + + TAILQ_ENTRY(gotd_child_proc) entry; }; +TAILQ_HEAD(gotd_procs, gotd_child_proc) procs; struct gotd_client { STAILQ_ENTRY(gotd_client) entry; @@ -110,6 +114,7 @@ static const struct got_error *start_repo_child(struct static const struct got_error *start_auth_child(struct gotd_client *, int, struct gotd_repo *, char *, const char *, int, int); static void kill_proc(struct gotd_child_proc *, int); +static void disconnect(struct gotd_client *); __dead static void usage(void) @@ -273,77 +278,62 @@ ensure_client_is_not_reading(struct gotd_client *clien } static void -wait_for_child(pid_t child_pid) +proc_done(struct gotd_child_proc *proc) { - pid_t pid; - int status; + struct gotd_client *client; - log_debug("waiting for child PID %ld to terminate", - (long)child_pid); + TAILQ_REMOVE(&procs, proc, entry); - do { - pid = waitpid(child_pid, &status, WNOHANG); - if (pid == -1) { - if (errno != EINTR && errno != ECHILD) - fatal("wait"); - } else if (WIFSIGNALED(status)) { - log_warnx("child PID %ld terminated; signal %d", - (long)pid, WTERMSIG(status)); - } - } while (pid != -1 || (pid == -1 && errno == EINTR)); -} + client = find_client_by_proc_fd(proc->iev.ibuf.fd); + if (client != NULL) { + if (proc == client->repo) + client->repo = NULL; + if (proc == client->auth) + client->auth = NULL; + if (proc == client->session) + client->session = NULL; + disconnect(client); + } -static void -proc_done(struct gotd_child_proc *proc) -{ - event_del(&proc->iev.ev); - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); + evtimer_del(&proc->tmo); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + } + free(proc); } static void kill_repo_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->repo == NULL) return; - proc = client->repo; + kill_proc(client->repo, 0); client->repo = NULL; - - proc_done(proc); } static void kill_auth_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->auth == NULL) return; - proc = client->auth; + kill_proc(client->auth, 0); client->auth = NULL; - - proc_done(proc); } static void kill_session_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->session == NULL) return; - proc = client->session; + kill_proc(client->session, 0); client->session = NULL; - - proc_done(proc); } static void @@ -742,6 +732,20 @@ static const char *gotd_proc_names[PROC_MAX] = { static void kill_proc(struct gotd_child_proc *proc, int fatal) { + struct timeval tv = { 5, 0 }; + + log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + proc->iev.ibuf.fd = -1; + } + + if (!evtimer_pending(&proc->tmo, NULL) && !fatal) + evtimer_add(&proc->tmo, &tv); + if (fatal) { log_warnx("sending SIGKILL to PID %d", proc->pid); kill(proc->pid, SIGKILL); @@ -750,9 +754,18 @@ kill_proc(struct gotd_child_proc *proc, int fatal) } static void +kill_proc_timeout(int fd, short ev, void *d) +{ + struct gotd_child_proc *proc = d; + + log_warnx("timeout waiting for PID %d to terminate;" + " retrying with force", proc->pid); + kill_proc(proc, 1); +} + +static void gotd_shutdown(void) { - struct gotd_child_proc *proc; uint64_t slot; log_debug("shutting down"); @@ -763,20 +776,31 @@ gotd_shutdown(void) disconnect(c); } - proc = gotd.listen_proc; - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); - free(proc); + kill_proc(gotd.listen_proc, 0); log_info("terminating"); exit(0); } +static struct gotd_child_proc * +find_proc_by_pid(pid_t pid) +{ + struct gotd_child_proc *proc = NULL; + + TAILQ_FOREACH(proc, &procs, entry) + if (proc->pid == pid) + break; + + return proc; +} + void gotd_sighdlr(int sig, short event, void *arg) { + struct gotd_child_proc *proc; + pid_t pid; + int status; + /* * Normal signal handler rules don't apply because libevent * decouples for us. @@ -793,6 +817,35 @@ gotd_sighdlr(int sig, short event, void *arg) case SIGINT: gotd_shutdown(); break; + case SIGCHLD: + for (;;) { + pid = waitpid(WAIT_ANY, &status, WNOHANG); + if (pid == -1) { + if (errno == EINTR) + continue; + if (errno == ECHILD) + break; + fatal("waitpid"); + } + if (pid == 0) + break; + + log_debug("reaped pid %d", pid); + proc = find_proc_by_pid(pid); + if (proc == NULL) { + log_info("caught exit of unknown child %d", + pid); + continue; + } + + if (WIFSIGNALED(status)) { + log_warnx("child PID %d terminated with" + " signal %d", pid, WTERMSIG(status)); + } + + proc_done(proc); + } + break; default: fatalx("unexpected signal"); } @@ -1505,6 +1558,10 @@ start_listener(char *argv0, const char *confpath, int if (proc == NULL) fatal("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, entry); + + /* proc->tmo is initialized in main() after event_init() */ + proc->type = PROC_LISTEN; if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, @@ -1531,6 +1588,9 @@ start_session_child(struct gotd_client *client, struct if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, entry); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + if (client_is_reading(client)) proc->type = PROC_SESSION_READ; else @@ -1577,6 +1637,9 @@ start_repo_child(struct gotd_client *client, enum gotd if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, entry); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + proc->type = proc_type; if (strlcpy(proc->repo_name, repo->name, sizeof(proc->repo_name)) >= sizeof(proc->repo_name)) @@ -1628,6 +1691,9 @@ start_auth_child(struct gotd_client *client, int requi close(fd); return err; } + + TAILQ_INSERT_HEAD(&procs, proc, entry); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); proc->type = PROC_AUTH; if (strlcpy(proc->repo_name, repo->name, @@ -1729,10 +1795,12 @@ main(int argc, char **argv) struct passwd *pw = NULL; char *repo_path = NULL; enum gotd_procid proc_id = PROC_GOTD; - struct event evsigint, evsigterm, evsighup, evsigusr1; + struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld; int *pack_fds = NULL, *temp_fds = NULL; struct gotd_repo *repo = NULL; + TAILQ_INIT(&procs); + log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */ while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) { @@ -1952,18 +2020,23 @@ main(int argc, char **argv) if (proc_id != PROC_GOTD) fatal("invalid process id %d", proc_id); + evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout, + gotd.listen_proc); + apply_unveil_selfexec(); signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL); signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL); signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL); signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL); + signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL); signal(SIGPIPE, SIG_IGN); signal_add(&evsigint, NULL); signal_add(&evsigterm, NULL); signal_add(&evsighup, NULL); signal_add(&evsigusr1, NULL); + signal_add(&evsigchld, NULL); gotd_imsg_event_add(&gotd.listen_proc->iev);