Jim Wright

Discussing all things around software engineering.

Graceful exits in GO

Posted on
Reading time 5 minutes


An exit sign

How to gracefully exit long running processes in GO

In this post I’m going to explain why we want graceful exits in programs, and how to implement them in GO.

Without gracefully exiting our programs we could leave things in an inconsistent state.

No handling

Below is a simple program that emulates doing some work by sleeping and has no exit handling.

package main

import (
	"fmt"
	"sync"
	"time"
)

// work receives jobs from workCh until the channel is closed. Each job is
// emulated with a one second sleep and is marked as complete on wg once it
// has finished.
func work(wg *sync.WaitGroup, workCh <-chan int) {
	for {
		job, open := <-workCh
		if !open {
			return
		}
		fmt.Printf("Starting %d\n", job)
		// Do something...
		time.Sleep(time.Second)
		fmt.Printf("Finished %d\n", job)
		wg.Done()
	}
}

// main hands ten jobs to a single background worker over an unbuffered
// channel and then blocks until every job has been marked as done. There is
// no signal handling, so an interrupt ends the program mid-job.
func main() {
	var wg sync.WaitGroup
	jobs := make(chan int)
	go work(&wg, jobs)

	const total = 10
	for job := 1; job <= total; job++ {
		// Register the job before handing it over so Wait cannot miss it.
		wg.Add(1)
		jobs <- job
	}
	wg.Wait()
}

What happens when you ctrl + c with no handling?

Starting 1
Finished 1
Starting 2
^Csignal: interrupt

As you can see we were unable to finish job #2. Depending on what type of work you do, this could leave your program or database in a bad state.

Batch handling

The following method iterates over batches, adding an incrementing integer to the work channel for each job. If cancelled, the current batch will run to completion, but the program will exit before any further batches can start.

package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"sync"
	"syscall"
	"time"
)

// work processes jobs from workCh one at a time until the channel is closed.
// Every job is emulated with a 200ms sleep, after which it is marked as
// complete on wg.
func work(wg *sync.WaitGroup, workCh <-chan int) {
	const jobDuration = 200 * time.Millisecond

	for {
		job, open := <-workCh
		if !open {
			return
		}
		fmt.Printf("Starting %d\n", job)
		// Do something...
		time.Sleep(jobDuration)
		fmt.Printf("Finished %d\n", job)
		wg.Done()
	}
}

// handleShutdownSignal blocks until the process receives an interrupt
// (ctrl + c) or SIGTERM, then closes shutdownCh to tell the rest of the
// program that a shutdown has been requested.
func handleShutdownSignal(shutdownCh chan struct{}) {
	// The signal package never blocks when delivering to the channel, so it
	// must be buffered; otherwise a signal that arrives while nothing is
	// receiving is dropped.
	quitCh := make(chan os.Signal, 1)
	signal.Notify(quitCh, os.Interrupt, syscall.SIGTERM)
	<-quitCh
	close(shutdownCh)
}

// main produces up to three batches of ten jobs for a single background
// worker. When a shutdown signal arrives, the batch currently being produced
// is allowed to finish, no further batch is started, and the program exits
// once all outstanding jobs are done.
func main() {
	wg := &sync.WaitGroup{}
	workCh := make(chan int)

	ctx, cancel := context.WithCancel(context.Background())
	// Release the context on the normal completion path as well; calling
	// cancel more than once is harmless.
	defer cancel()

	shutdownCh := make(chan struct{})
	go handleShutdownSignal(shutdownCh)
	go func() {
		<-shutdownCh
		cancel()
	}()

	go work(wg, workCh)

	n := 1
batches:
	for batch := 1; batch <= 3; batch++ {
		select {
		case <-ctx.Done():
			fmt.Printf("Cancelled before batch %d can produce\n", batch)
			// A bare break would only leave the select and the loop would
			// carry on to the next batch; the label leaves the loop itself.
			break batches
		default:
			fmt.Printf("Starting batch %d\n", batch)
			for i := 1; i <= 10; i++ {
				// Register the job before handing it over so Wait cannot
				// miss it.
				wg.Add(1)
				workCh <- n
				n++
			}
		}
	}
	wg.Wait()
	fmt.Println("Finished")
}

Now what happens when we ctrl + c?

Starting batch 1
Starting 1
...
Finished 9
Starting batch 2
Starting 10
...
Starting 13
^CFinished 13
Starting 14
...
Finished 19
Starting 20
Cancelled before batch 3 can produce
Finished 20
Finished

Exiting a long running process/daemon

When writing a daemon you don’t typically expect it to exit unless there has been an error or it has been cancelled. Because of this we have removed all code relating to the process being ‘complete’. You will also notice that there is now a goroutine adding items to the channel to be processed. You should imagine that this is emulating work arriving from HTTP requests or from a message queue.

In the following example, when we receive a sigint we will stop writing to the channel and finish off the work in workCh. A key thing to note here is the buffer size of workCh. This directly relates to the maximum time we could need to wait for the shutdown.

package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"sync"
	"syscall"
	"time"
)

// work consumes jobs from workCh until the channel is closed. Each job is
// emulated with a one second sleep and is then marked as complete on wg.
func work(wg *sync.WaitGroup, workCh <-chan int) {
	for {
		job, open := <-workCh
		if !open {
			return
		}
		fmt.Printf("Starting %d\n", job)
		// Do something...
		time.Sleep(time.Second)
		fmt.Printf("Finished %d\n", job)
		wg.Done()
	}
}

// produce emulates incoming work by sending increasing integers, starting at
// 1, to workCh. Every job is registered on wg before it is sent. Once ctx is
// cancelled no further jobs are produced and the function returns.
//
// A send that is already blocked on a full workCh when ctx is cancelled still
// completes; cancellation is only observed between jobs.
func produce(wg *sync.WaitGroup, workCh chan<- int, ctx context.Context) {
	for i := 1; i <= 99999999; i++ {
		select {
		case <-ctx.Done():
			// Return rather than break: a break here would only leave the
			// select and the loop would spin through its remaining
			// iterations doing nothing.
			return
		default:
			wg.Add(1)
			workCh <- i
		}
	}
}

// handleShutdownSignal blocks until the process receives an interrupt
// (ctrl + c) or SIGTERM, then closes shutdownCh to tell the rest of the
// program that a shutdown has been requested.
func handleShutdownSignal(shutdownCh chan struct{}) {
	// The signal package never blocks when delivering to the channel, so it
	// must be buffered; otherwise a signal that arrives while nothing is
	// receiving is dropped.
	quitCh := make(chan os.Signal, 1)
	signal.Notify(quitCh, os.Interrupt, syscall.SIGTERM)
	<-quitCh
	close(shutdownCh)
}

// main runs a producer and a worker connected by a channel that can queue up
// to ten jobs. It blocks until a shutdown signal cancels the context and then
// waits for every job that has already been registered to finish.
func main() {
	var wg sync.WaitGroup
	jobs := make(chan int, 10)

	ctx, stop := context.WithCancel(context.Background())

	// Translate the shutdown notification into a context cancellation.
	shutdownCh := make(chan struct{})
	go handleShutdownSignal(shutdownCh)
	go func() {
		<-shutdownCh
		stop()
	}()

	go produce(&wg, jobs, ctx)
	go work(&wg, jobs)

	// Nothing to do here until a shutdown has been requested.
	<-ctx.Done()

	fmt.Println("Waiting for shutdown")
	wg.Wait()
	fmt.Println("All services shutdown")
}

As you can see, once we hit ctrl + c the remaining items in the channel buffer were processed.

Starting 1
Finished 1
Starting 2
^CWaiting for shutdown
Finished 2
Starting 3
...
Finished 13
All services shutdown

What happens if we put a delay in the producer?

Starting 1
Finished 1
Starting 2
^CWaiting for shutdown
Finished 2
Starting 3
Finished 3
All services shutdown

It took longer to add items to the buffer, so when we cancelled there were fewer items processed as the buffer wasn’t full.

Can’t I ignore what is left in the buffer?

If you want to exit straight away, you should be aware that it is possible you will suffer from data loss as your channel buffer won’t be processed. That said, take a look at the following example:

// handleShutdownSignal reacts to interrupt (ctrl + c) and SIGTERM signals.
// The first signal closes shutdownCh so the program can begin a graceful
// shutdown; a second signal exits the process immediately, abandoning any
// work that is still queued.
func handleShutdownSignal(shutdownCh chan struct{}) {
	// The signal package never blocks when delivering to the channel, so it
	// must be buffered; otherwise a signal that arrives while nothing is
	// receiving is dropped.
	quitCh := make(chan os.Signal, 1)
	signal.Notify(quitCh, os.Interrupt, syscall.SIGTERM)

	// First signal: request a graceful shutdown.
	<-quitCh
	close(shutdownCh)

	// Second signal: the user does not want to wait any longer.
	<-quitCh
	os.Exit(0)
}

When you hit ctrl + c it will initiate the normal behaviour, but if done again it will exit immediately.

Starting 1
Finished 1
Starting 2
^CWaiting for shutdown
^C

What about restricting the time I want to wait for the buffer to empty?

If you have a large buffer size or you don’t want to wait a while for your buffer to be processed you can put a time limit in place.

package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"sync"
	"syscall"
	"time"
)

// work pulls jobs from workCh until the channel is closed. Each job is
// emulated with a one second sleep and is then marked as complete on wg.
func work(wg *sync.WaitGroup, workCh <-chan int) {
	for {
		job, open := <-workCh
		if !open {
			return
		}
		fmt.Printf("Starting %d\n", job)
		// Do something...
		time.Sleep(time.Second)
		fmt.Printf("Finished %d\n", job)
		wg.Done()
	}
}

// produce emulates incoming work by sending increasing integers, starting at
// 1, to workCh. Every job is registered on wg before it is sent. Once ctx is
// cancelled no further jobs are produced and the function returns.
//
// A send that is already blocked on a full workCh when ctx is cancelled still
// completes; cancellation is only observed between jobs.
func produce(wg *sync.WaitGroup, workCh chan<- int, ctx context.Context) {
	for i := 1; i <= 99999999; i++ {
		select {
		case <-ctx.Done():
			// Return rather than break: a break here would only leave the
			// select and the loop would spin through its remaining
			// iterations doing nothing.
			return
		default:
			wg.Add(1)
			workCh <- i
		}
	}
}

// handleShutdownSignal reacts to interrupt (ctrl + c) and SIGTERM signals.
// The first signal closes shutdownCh so the program can begin a graceful
// shutdown; a second signal exits the process immediately, abandoning any
// work that is still queued.
func handleShutdownSignal(shutdownCh chan struct{}) {
	// The signal package never blocks when delivering to the channel, so it
	// must be buffered; otherwise a signal that arrives while nothing is
	// receiving is dropped.
	quitCh := make(chan os.Signal, 1)
	signal.Notify(quitCh, os.Interrupt, syscall.SIGTERM)

	// First signal: request a graceful shutdown.
	<-quitCh
	close(shutdownCh)

	// Second signal: the user does not want to wait any longer.
	<-quitCh
	os.Exit(0)
}

// main runs a producer and a worker connected by a channel that can queue up
// to ten jobs. After a shutdown signal it waits at most ten seconds for the
// outstanding jobs to finish. If they finish in time it reports a clean
// shutdown; otherwise it reports the timeout and exits with a non-zero
// status.
func main() {
	wg := &sync.WaitGroup{}
	workCh := make(chan int, 10)

	ctx, cancel := context.WithCancel(context.Background())

	shutdownCh := make(chan struct{})
	go handleShutdownSignal(shutdownCh)
	go func() {
		<-shutdownCh
		cancel()
	}()

	go produce(wg, workCh, ctx)
	go work(wg, workCh)

	<-ctx.Done()

	// wg.Wait cannot be used in a select directly, so turn its completion
	// into a channel close.
	doneCh := make(chan struct{})
	go func() {
		wg.Wait()
		close(doneCh)
	}()

	fmt.Println("Waiting for shutdown")

	select {
	case <-doneCh:
		fmt.Println("All services shutdown")
	case <-time.After(time.Second * 10):
		// Jobs are still outstanding, so do not claim a clean shutdown.
		fmt.Println("Took too long to shutdown")
		os.Exit(1)
	}
}

If you found this interesting...

You might like to read the rest of learning-go