#!/bin/zsh

# Directory scanned for input CSV files, and the merged file that is (re)created.
CSV_DIR="csv"
OUTPUT="output.csv"

# Find all CSV files in the directory.
# null_glob is enabled only around the glob so that "no match" yields an empty
# array instead of zsh's own "no matches found" error; a missing directory
# therefore also ends up as an empty array.
setopt null_glob
FILES=("$CSV_DIR"/*.csv)
unsetopt null_glob

if (( ${#FILES[@]} == 0 )); then
  echo "No CSV files found in '$CSV_DIR'." >&2
  exit 1
fi

# Write header from the first file (this truncates any previous $OUTPUT).
# Stop here if the header cannot be read or the output cannot be created.
if ! head -n 1 "${FILES[1]}" > "$OUTPUT"; then
  echo "Failed to write header from '${FILES[1]}' to '$OUTPUT'." >&2
  exit 1
fi

# Merge state:
#   seen_rows   - data rows already written, used to detect duplicates
#   file_count  - number of input files processed
#   row_count   - number of unique data rows appended to $OUTPUT
#   output_line - line number in $OUTPUT that the next unique row will occupy
typeset -A seen_rows
file_count=0
row_count=0
output_line=2 # Start after header

total_start=$(date +%s)

# Open the output once, in append mode, on fd 3 instead of reopening the file
# for every single row.
exec 3>>"$OUTPUT" || { echo "Cannot open '$OUTPUT' for appending." >&2; exit 1; }

for f in "${FILES[@]}"; do
  ((file_count++))
  print -r -- "Processing file: $f"
  file_start=$(date +%s)
  line_num=0
  file_data_rows=0
  # The `|| [[ -n "$line" ]]` part keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    ((line_num++))
    if [[ $line_num -eq 1 ]]; then
      continue # skip this file's header line
    fi
    # Check for duplicate (a row identical to one already written, from any file)
    if [[ -n ${seen_rows["$line"]} ]]; then
      print -r -- "Duplicate row detected at output line $output_line (from $f, input line $line_num): $line"
      continue
    fi
    seen_rows["$line"]=1
    # print -r writes the row verbatim; zsh's builtin echo would interpret
    # backslash escapes (and some leading-dash arguments) and corrupt the data.
    print -r -u3 -- "$line"
    ((row_count++))
    ((output_line++))
    ((file_data_rows++))
  done < "$f"
  file_end=$(date +%s)
  file_elapsed=$((file_end - file_start))
  print -r -- "Finished processing $f ($file_data_rows data rows added) in ${file_elapsed}s"
done

# Close the output descriptor now that every file has been merged.
exec 3>&-

# Final report: wall-clock duration of the run (whole seconds) plus the totals
# accumulated while merging.
total_end=$(date +%s)
total_elapsed=$(( total_end - total_start ))

printf '%s\n' \
  "Files processed: $file_count" \
  "Total data rows in $OUTPUT: $row_count" \
  "Total time: ${total_elapsed}s"
