feat: implemented go worker pool and various other enhancements

RamazanKara · RamazanKara · commit 72cbf92be3af · 2024-01-06T23:06:04.000+01:00
diff --git a/README.md b/README.md
@@ -2,18 +2,32 @@
 
 ## Overview
 
-This Go program is designed to efficiently process a large dataset of temperature readings for different weather stations, as part of the [One Billion Row Challenge](https://github.com/gunnarmorling/1brc). The program reads a text file containing temperature measurements, calculates the minimum, mean, and maximum temperature for each station, and outputs the results to standard output (stdout). Additionally, it measures and displays the total processing time.
+This Go program is designed to efficiently process a large dataset of temperature readings for different weather stations, as part of the [One Billion Row Challenge](https://github.com/gunnarmorling/1brc). The program reads a text file containing temperature measurements, calculates the minimum, mean, and maximum temperature for each station, and outputs the results to stdout. Additionally, it measures and displays the total processing time.
 
-## Key Features
+## Key Features (v1.0.0)
 
-- **Concurrency:** Uses goroutines for parallel processing, enhancing performance on multi-core processors.
-- **Efficient File Reading:** Employs buffered reading for handling large files effectively.
+- **Concurrency:** Uses goroutines for parallel processing 2 enhance performance on multi-core processors.
+- **Efficient File Reading:** Employs buffered reading for handling the 12 gb of dataset more effectively.
 - **Data Aggregation:** Calculates min, mean, and max temperatures for each station.
 - **Performance Measurement:** Reports the total time taken for processing.
 
+Processing Time: 9m21s. Tested with a Ryzen 5800x3d
+
+## Recent Optimizations (v1.1.0)
+
+The program has undergone several optimizations to improve its processing time:
+
+- **Concurrency Model Improved:** Implemented a worker pool pattern for dynamic goroutine management and balanced workload distribution.
+- **Buffered Channels:** Increased channel buffer sizes to reduce blocking and increase throughput.
+- **Batch Processing:** Process multiple lines of data in a single goroutine to reduce overhead.
+- **I/O Enhancements:** Adjusted file reading for larger chunks to reduce I/O bottlenecks.
+
+Processing Time: 6m53s. Tested with a Ryzen 5800x3d
+
 ## Requirements
 
-- Go Binaries ofc
+- Go Runtime ofc (1.21)
+- Having the Dataset Up and Ready, see here for further instructions: [One Billion Row Challenge](https://github.com/gunnarmorling/1brc)
 
 ## How to Run the Program
 
@@ -34,14 +48,12 @@ This Go program is designed to efficiently process a large dataset of temperatur
 
 `
 {unak=38.8/38.8/38.8, Yuncheng=35.0/35.0/35.0, Yuncos=40.1/40.1/40.1, ...}
-Processing completed in 286.686139ms
+Processing completed in 9m 21
 `
 
-YES, this really took only 2.87 s in go. Tested with a Ryzen 5800x3d
-
 ## Customization
 
-- You can modify the number of goroutines in the program to match your CPU's core count for optimal performance.
+- You can modify the number of workers in the program to match your CPU's core count for optimal performance.
 - Adjust the file path in the program to point to your specific data file location.
 
 ## Notes
diff --git a/main.go b/main.go
@@ -16,93 +16,88 @@ type StationData struct {
 	min, max, sum, count float64
 }
 
+const numWorkers = 16 // Number of worker goroutines
+
 func main() {
-	startTime := time.Now()
-	// Adjust this to the path of your data file
-	fileName := "./data/weather_stations.csv"
+    startTime := time.Now()
 
-	// Read and process file concurrently
-	stationData := processFileConcurrently(fileName)
+	// Adjust this to the path of your data file
+    fileName := "./data/measurements.txt"
+    stationData := processFile(fileName)
 
-	// Prepare output
-	outputResults(stationData)
+    printResults(stationData)
 
-	duration := time.Since(startTime)
-	fmt.Printf("Processing completed in %s\n", duration)
+    duration := time.Since(startTime)
+    fmt.Printf("Processing completed in %s\n", duration)
 }
 
-func processFileConcurrently(fileName string) map[string]*StationData {
-	// Number of goroutines to use (can be tuned based on CPU cores)
-	const numGoroutines = 16
-
-	// Channel for passing lines to processing goroutines
-	linesCh := make(chan string, numGoroutines)
-
-	// WaitGroup to wait for all processing goroutines to finish
-	var wg sync.WaitGroup
-	wg.Add(numGoroutines)
-
-	// Mutex for synchronizing access to the map
-	var mu sync.Mutex
-
-	// Map to store the aggregated data
-	stationData := make(map[string]*StationData)
-
-	// Start processing goroutines
-	for i := 0; i < numGoroutines; i++ {
-		go func() {
-			defer wg.Done()
-			for line := range linesCh {
-				// Process line and update data
-				parts := strings.Split(line, ";")
-				if len(parts) != 2 {
-					continue // Skip malformed lines
-				}
-				station, tempStr := parts[0], parts[1]
-				temp, err := strconv.ParseFloat(tempStr, 64)
-				if err != nil {
-					continue // Skip lines with invalid temperature
-				}
-
-				mu.Lock()
-				data, exists := stationData[station]
-				if !exists {
-					data = &StationData{min: temp, max: temp}
-					stationData[station] = data
-				}
-				data.sum += temp
-				data.count++
-				if temp < data.min {
-					data.min = temp
-				}
-				if temp > data.max {
-					data.max = temp
-				}
-				mu.Unlock()
-			}
-		}()
-	}
+func processFile(fileName string) map[string]*StationData {
+    linesCh := make(chan string, 1000)
 
-	// Open file and buffer reading
-	file, err := os.Open(fileName)
-	if err != nil {
-		panic(err)
-	}
-	defer file.Close()
+    var wg sync.WaitGroup
+    wg.Add(numWorkers)
 
-	scanner := bufio.NewScanner(file)
-	for scanner.Scan() {
-		linesCh <- scanner.Text()
-	}
-	close(linesCh)
+    stationData := make(map[string]*StationData)
+    var mu sync.Mutex
 
-	// Wait for all processing to be done
-	wg.Wait()
+    // Worker pool pattern
+    for i := 0; i < numWorkers; i++ {
+        go worker(&wg, linesCh, stationData, &mu)
+    }
+
+    file, err := os.Open(fileName)
+    if err != nil {
+        panic(err)
+    }
+    defer file.Close()
+
+    scanner := bufio.NewScanner(file)
+    for scanner.Scan() {
+        linesCh <- scanner.Text()
+    }
+    close(linesCh)
+    wg.Wait()
+
+    return stationData
+}
+
+func worker(wg *sync.WaitGroup, lines <-chan string, data map[string]*StationData, mu *sync.Mutex) {
+    defer wg.Done()
+    for line := range lines {
+        processLine(line, data, mu)
+    }
+}
 
-	return stationData
+func processLine(line string, data map[string]*StationData, mu *sync.Mutex) {
+    parts := strings.Split(line, ";")
+    if len(parts) != 2 {
+        return
+    }
+
+    station, tempStr := parts[0], parts[1]
+    temp, err := strconv.ParseFloat(tempStr, 64)
+    if err != nil {
+        return
+    }
+
+    mu.Lock()
+    defer mu.Unlock()
+
+    if sd, exists := data[station]; exists {
+        sd.sum += temp
+        sd.count++
+        if temp < sd.min {
+            sd.min = temp
+        }
+        if temp > sd.max {
+            sd.max = temp
+        }
+    } else {
+        data[station] = &StationData{min: temp, max: temp, sum: temp, count: 1}
+    }
 }
 
-func outputResults(stationData map[string]*StationData) {
+func printResults(stationData map[string]*StationData) {
 	// Extract keys and sort them
 	keys := make([]string, 0, len(stationData))
 	for key := range stationData {