From df14812db277234fe6a3deb0eb9c1ca96d433197 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Fri, 12 Dec 2025 16:18:06 -0800
Subject: [PATCH 1/4] Add support to model AA haplotypes

Reorganizes workflow to support fitting the MLR with different variant
classifications as in the forecasts-ncov workflow. In addition to the
original "emerging_haplotype" variant classification, I've added a
"aa_haplotype" classification which uses more granular, automated
haplotype assignments based on current clade annotation and all HA1
substitutions from that parent clade. We will likely need to tune the
minimum number of "clade" sequences per AA haplotype to allow rarer
haplotypes to appear in the analysis. For now, I've kept the same
thresholds for both variant classifications, though.

As part of this reorganization, I've also added support for different
data provenances.

I also realized that it did not make sense to implement different date
thresholds for each potential model output; all of the analyses we run
should represent the same time span. For this reason, I've moved
"min_date" and "max_date" to the top-level config.
---
 Snakefile                       | 152 ++++++++++++++++++--------------
 config/defaults.yaml            |  57 +++++++-----
 scripts/add_colors_to_model.py  |  34 +++++--
 scripts/assign_aa_haplotypes.py | 104 ++++++++++++++++++++++
 workflow/upload.smk             |  12 +--
 5 files changed, 259 insertions(+), 100 deletions(-)
 create mode 100644 scripts/assign_aa_haplotypes.py

diff --git a/Snakefile b/Snakefile
index 1427850..3f577c2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,7 +1,10 @@
 configfile: "config/defaults.yaml"
 
 wildcard_constraints:
-    date = r"\d{4}-\d{2}-\d{2}"
+    data_provenance=r"(gisaid)",
+    variant_classification=r"(emerging_haplotype|aa_haplotype)",
+    lineage=r"(h1n1pdm|h3n2|vic)",
+    date=r"\d{4}-\d{2}-\d{2}"
 
 def get_todays_date():
     from datetime import datetime
@@ -13,24 +16,24 @@ run_date = config.get("run_date", get_todays_date())
 if config.get("s3_dst"):
     rule upload_all_models:
         input:
-            expand("results/{lineage}/{geo_resolution}/mlr/{date}_results_s3_upload.done", lineage=config["lineages"], geo_resolution=config["geo_resolutions"], date=run_date),
-            expand("results/{lineage}/{geo_resolution}/mlr/results_s3_upload.done", lineage=config["lineages"], geo_resolution=config["geo_resolutions"])
+            expand("results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/{date}_results_s3_upload.done", data_provenance=config["data_provenances"], variant_classification=config["variant_classifications"], lineage=config["lineages"], geo_resolution=config["geo_resolutions"], date=run_date),
+            expand("results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/results_s3_upload.done", data_provenance=config["data_provenances"], variant_classification=config["variant_classifications"], lineage=config["lineages"], geo_resolution=config["geo_resolutions"])
 else:
     rule all:
         input:
-            expand("plots/{lineage}/{geo_resolution}/ga/ga_by_variant.png", lineage=config["lineages"], geo_resolution=config["geo_resolutions"]),
-            expand("plots/{lineage}/{geo_resolution}/ga/ga_by_location.png", lineage=config["lineages"], geo_resolution=config["geo_resolutions"]),
-            expand("plots/{lineage}/{geo_resolution}/freq/freq_by_location.png", lineage=config["lineages"], geo_resolution=config["geo_resolutions"]),
+            expand("plots/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/ga/ga_by_variant.png", data_provenance=config["data_provenances"], variant_classification=config["variant_classifications"], lineage=config["lineages"], geo_resolution=config["geo_resolutions"]),
+            expand("plots/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/ga/ga_by_location.png", data_provenance=config["data_provenances"], variant_classification=config["variant_classifications"], lineage=config["lineages"], geo_resolution=config["geo_resolutions"]),
+            expand("plots/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/freq/freq_by_location.png", data_provenance=config["data_provenances"], variant_classification=config["variant_classifications"], lineage=config["lineages"], geo_resolution=config["geo_resolutions"]),
 
 rule all_models:
     input:
-        expand("results/{lineage}/{geo_resolution}/mlr/MLR_results.json", lineage=config["lineages"], geo_resolution=config["geo_resolutions"]),
+        expand("results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/MLR_results.json", data_provenance=config["data_provenances"], variant_classification=config["variant_classifications"], lineage=config["lineages"], geo_resolution=config["geo_resolutions"]),
 
 rule download_metadata:
     output:
-        "data/{lineage}/metadata.tsv",
+        "data/{data_provenance}/{lineage}/metadata.tsv",
     params:
-        s3_path=lambda wildcards: config["data"][wildcards.lineage]["s3_metadata"]
+        s3_path=lambda wildcards: config["data"][wildcards.data_provenance][wildcards.lineage]["s3_metadata"],
     shell:
         """
         aws s3 cp {params.s3_path} - | xz -c -d > {output}
@@ -38,9 +41,9 @@ rule download_metadata:
 
 rule download_nextclade:
     output:
-        "data/{lineage}/nextclade.tsv",
+        "data/{data_provenance}/{lineage}/nextclade.tsv",
     params:
-        s3_path=lambda wildcards: config["data"][wildcards.lineage]["s3_nextclade"]
+        s3_path=lambda wildcards: config["data"][wildcards.data_provenance][wildcards.lineage]["s3_nextclade"],
     shell:
         """
         aws s3 cp {params.s3_path} - | xz -c -d > {output}
@@ -48,7 +51,7 @@ rule download_nextclade:
 
 rule download_haplotype_definitions:
     output:
-        haplotypes="results/{lineage}/haplotype_definitions.tsv",
+        haplotypes="data/nextstrain/{lineage}/haplotype_definitions.tsv",
     shell:
         """
         curl \
@@ -59,10 +62,10 @@ rule download_haplotype_definitions:
 
 rule metadata_with_nextclade:
     input:
-        metadata="data/{lineage}/metadata.tsv",
-        nextclade="data/{lineage}/nextclade.tsv",
+        metadata="data/{data_provenance}/{lineage}/metadata.tsv",
+        nextclade="data/{data_provenance}/{lineage}/nextclade.tsv",
     output:
-        metadata="data/{lineage}/metadata_with_nextclade.tsv",
+        metadata="data/{data_provenance}/{lineage}/metadata_with_nextclade.tsv",
     shell:
         """
         augur merge \
@@ -73,12 +76,12 @@ rule metadata_with_nextclade:
 
 rule filter_data:
     input:
-        metadata="data/{lineage}/metadata_with_nextclade.tsv",
+        metadata="data/{data_provenance}/{lineage}/metadata_with_nextclade.tsv",
     output:
-        metadata="results/{lineage}/{geo_resolution}/metadata_with_nextclade.tsv",
+        metadata="data/{data_provenance}/{lineage}/filtered_metadata_with_nextclade.tsv",
     params:
-        min_date=lambda wildcards: config["prepare_data"][wildcards.geo_resolution]["min_date"],
-        max_date=lambda wildcards: config["prepare_data"][wildcards.geo_resolution]["max_date"],
+        min_date=lambda wildcards: config["min_date"],
+        max_date=lambda wildcards: config["max_date"],
     shell:
         """
         augur filter \
@@ -89,14 +92,15 @@ rule filter_data:
             --output-metadata {output.metadata}
         """
 
-rule assign_haplotypes:
+rule assign_emerging_haplotypes:
     input:
-        metadata="results/{lineage}/{geo_resolution}/metadata_with_nextclade.tsv",
-        haplotypes="results/{lineage}/haplotype_definitions.tsv",
+        metadata="data/{data_provenance}/{lineage}/filtered_metadata_with_nextclade.tsv",
+        haplotypes="data/nextstrain/{lineage}/haplotype_definitions.tsv",
     output:
-        metadata="results/{lineage}/{geo_resolution}/metadata_with_nextclade_updated.tsv"
+        metadata="data/{data_provenance}/{lineage}/metadata_with_nextclade_with_emerging_haplotypes.tsv",
     params:
         variant_column=config["haplotype_variant_column"],
+        haplotype_column_name="emerging_haplotype",
         default_haplotype="other",
     shell:
         """
@@ -104,19 +108,41 @@ rule assign_haplotypes:
             --substitutions {input.metadata} \
             --haplotypes {input.haplotypes} \
             --clade-column {params.variant_column:q} \
+            --haplotype-column-name {params.haplotype_column_name:q} \
             --default-haplotype {params.default_haplotype:q} \
             --output-table {output.metadata}
         """
 
+rule assign_aa_haplotypes:
+    input:
+        metadata="data/{data_provenance}/{lineage}/metadata_with_nextclade_with_emerging_haplotypes.tsv",
+    output:
+        metadata="data/{data_provenance}/{lineage}/metadata_with_nextclade_with_aa_haplotypes.tsv",
+    params:
+        genes=["HA1"],
+        clade_column=config["haplotype_variant_column"],
+        mutations_column=config["mutations_column"],
+        haplotype_column_name="aa_haplotype",
+    shell:
+        r"""
+        python3 scripts/assign_aa_haplotypes.py \
+            --nextclade {input.metadata:q} \
+            --genes {params.genes:q} \
+            --strip-genes \
+            --clade-column {params.clade_column:q} \
+            --mutations-column {params.mutations_column:q} \
+            --attribute-name {params.haplotype_column_name:q} \
+            --output {output.metadata:q}
+        """
+
 rule clade_seq_counts:
     input:
-        metadata="results/{lineage}/{geo_resolution}/metadata_with_nextclade_updated.tsv",
+        metadata="data/{data_provenance}/{lineage}/metadata_with_nextclade_with_aa_haplotypes.tsv",
     output:
-        sequence_counts="results/{lineage}/{geo_resolution}/seq_counts.tsv",
+        sequence_counts="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/seq_counts.tsv",
     params:
         id_column="strain",
         date_column="date",
-        variant_column=config["variant"],
     shell:
         """
         ./scripts/summarize-clade-sequence-counts \
@@ -124,20 +150,20 @@ rule clade_seq_counts:
             --id-column {params.id_column:q} \
             --date-column {params.date_column:q} \
             --location-column {wildcards.geo_resolution:q} \
-            --clade-column {params.variant_column:q} \
+            --clade-column {wildcards.variant_classification:q} \
             --output {output.sequence_counts}
             """
 
 rule prepare_clade_data:
     """Preparing clade counts for analysis"""
     input:
-        sequence_counts = "results/{lineage}/{geo_resolution}/seq_counts.tsv"
+        sequence_counts = "results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/seq_counts.tsv"
     output:
-        sequence_counts = "results/{lineage}/{geo_resolution}/prepared_seq_counts.tsv"
+        sequence_counts = "results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/prepared_seq_counts.tsv"
     params:
-        min_date=lambda wildcards: config["prepare_data"][wildcards.geo_resolution]["min_date"],
-        location_min_seq=lambda wildcards: config["prepare_data"][wildcards.geo_resolution]["location_min_seq"],
-        clade_min_seq=lambda wildcards: config["prepare_data"][wildcards.geo_resolution]["clade_min_seq"],
+        min_date=lambda wildcards: config["min_date"],
+        location_min_seq=lambda wildcards: config["prepare_data"][wildcards.data_provenance][wildcards.variant_classification][wildcards.geo_resolution]["location_min_seq"],
+        clade_min_seq=lambda wildcards: config["prepare_data"][wildcards.data_provenance][wildcards.variant_classification][wildcards.geo_resolution]["clade_min_seq"],
     shell:
         """
         python ./scripts/prepare-data.py \
@@ -150,16 +176,16 @@ rule prepare_clade_data:
 
 rule mlr_model:
     input:
-        counts="results/{lineage}/{geo_resolution}/prepared_seq_counts.tsv",
+        counts="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/prepared_seq_counts.tsv",
         config="config/mlr/{lineage}.yaml",
     output:
-        model="results/{lineage}/{geo_resolution}/mlr/initial_MLR_results.json",
+        model="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/initial_MLR_results.json",
     params:
         data_name="initial_MLR",
-        path="results/{lineage}/{geo_resolution}/mlr/",
-        max_date=lambda wildcards: config["prepare_data"][wildcards.geo_resolution].get("max_date", "0D"),
+        path=subpath(output.model, parent=True),
+        max_date=config["max_date"],
     benchmark:
-        "results/{lineage}/{geo_resolution}/mlr/mlr-model_benchmark.tsv"
+        "results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/mlr-model_benchmark.tsv"
     resources:
         mem_mb=3000,
     shell:
@@ -174,29 +200,29 @@ rule mlr_model:
 
 rule add_colors_to_mlr_model:
     input:
-        model="results/{lineage}/{geo_resolution}/mlr/initial_MLR_results.json",
-        auspice_config="results/{lineage}/auspice_config.json",
+        model="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/initial_MLR_results.json",
+        auspice_config="data/nextstrain/{lineage}/auspice_config.json",
+        color_schemes="config/color_schemes.tsv",
     output:
-        model="results/{lineage}/{geo_resolution}/mlr/MLR_results.json",
-    params:
-        coloring_field=config["coloring_field"],
+        model="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/MLR_results.json",
     shell:
         r"""
         python scripts/add_colors_to_model.py \
             --model {input.model:q} \
             --auspice-config {input.auspice_config:q} \
-            --coloring-field {params.coloring_field:q} \
+            --color-schemes {input.color_schemes:q} \
+            --coloring-field {wildcards.variant_classification:q} \
             --output {output.model:q}
         """
 
 rule parse_mlr_json:
     input:
-        model="results/{lineage}/{geo_resolution}/mlr/MLR_results.json",
+        model="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/MLR_results.json",
     output:
-        ga="results/{lineage}/{geo_resolution}/mlr/ga.tsv",
-        freq="results/{lineage}/{geo_resolution}/mlr/freq.tsv",
-        emp="results/{lineage}/{geo_resolution}/mlr/raw_freq.tsv",
-        freq_forecast="results/{lineage}/{geo_resolution}/mlr/freq_forecast.tsv",
+        ga="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/ga.tsv",
+        freq="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/freq.tsv",
+        emp="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/raw_freq.tsv",
+        freq_forecast="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/freq_forecast.tsv",
     params:
         version="MLR",
     shell:
@@ -214,7 +240,7 @@ rule get_pivot:
     input:
         config="config/mlr/{lineage}.yaml"
     output:
-        "results/{lineage}/{geo_resolution}/pivot.txt"
+        "results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/pivot.txt"
     shell:
         """
         python3 scripts/get_pivot.py \
@@ -224,7 +250,7 @@ rule get_pivot:
 
 rule download_auspice_config_json:
     output:
-        config="results/{lineage}/auspice_config.json",
+        config="data/nextstrain/{lineage}/auspice_config.json",
     shell:
         """
         curl \
@@ -235,14 +261,12 @@ rule download_auspice_config_json:
 
 rule plot_freq:
     input:
-        freq_data="results/{lineage}/{geo_resolution}/mlr/freq.tsv",
-        raw_data="results/{lineage}/{geo_resolution}/mlr/raw_freq.tsv",
+        freq_data="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/freq.tsv",
+        raw_data="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/raw_freq.tsv",
         color_scheme="config/color_schemes.tsv",
-        auspice_config="results/{lineage}/auspice_config.json"
+        auspice_config="data/nextstrain/{lineage}/auspice_config.json",
     output:
-        variant="plots/{lineage}/{geo_resolution}/freq/freq_by_location.png"
-    params:
-        coloring_field=config["coloring_field"],
+        variant="plots/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/freq/freq_by_location.png",
     shell:
         """
         python3 ./scripts/plot-freq.py \
@@ -250,21 +274,19 @@ rule plot_freq:
             --input_raw {input.raw_data} \
             --colors {input.color_scheme} \
             --auspice-config {input.auspice_config} \
-            --coloring-field {params.coloring_field} \
+            --coloring-field {wildcards.variant_classification} \
             --output {output.variant}
         """
 
 rule plot_ga:
     input:
-        ga="results/{lineage}/{geo_resolution}/mlr/ga.tsv",
+        ga="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/ga.tsv",
         color_scheme="config/color_schemes.tsv",
-        pivot="results/{lineage}/{geo_resolution}/pivot.txt",
-        auspice_config="results/{lineage}/auspice_config.json"
+        pivot="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/pivot.txt",
+        auspice_config="data/nextstrain/{lineage}/auspice_config.json"
     output:
-        variant="plots/{lineage}/{geo_resolution}/ga/ga_by_variant.png",
-        location="plots/{lineage}/{geo_resolution}/ga/ga_by_location.png",
-    params:
-        coloring_field=config["coloring_field"],
+        variant="plots/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/ga/ga_by_variant.png",
+        location="plots/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/ga/ga_by_location.png",
     shell:
         """
         python3 ./scripts/plot-ga.py \
@@ -275,7 +297,7 @@ rule plot_ga:
             --out_location {output.location} \
             --pivot {input.pivot} \
             --auspice-config {input.auspice_config} \
-            --coloring-field {params.coloring_field}
+            --coloring-field {wildcards.variant_classification}
         """
 
 if config.get("s3_dst"):
diff --git a/config/defaults.yaml b/config/defaults.yaml
index 9ee49dc..9facdc1 100644
--- a/config/defaults.yaml
+++ b/config/defaults.yaml
@@ -1,13 +1,21 @@
 data:
-  h3n2:
-    s3_metadata: "s3://nextstrain-data-private/files/workflows/seasonal-flu/h3n2/metadata.tsv.xz"
-    s3_nextclade: "s3://nextstrain-data-private/files/workflows/seasonal-flu/h3n2/ha/nextclade.tsv.xz"
-  h1n1pdm:
-    s3_metadata: "s3://nextstrain-data-private/files/workflows/seasonal-flu/h1n1pdm/metadata.tsv.xz"
-    s3_nextclade: "s3://nextstrain-data-private/files/workflows/seasonal-flu/h1n1pdm/ha/nextclade.tsv.xz"
-  vic:
-    s3_metadata: "s3://nextstrain-data-private/files/workflows/seasonal-flu/vic/metadata.tsv.xz"
-    s3_nextclade: "s3://nextstrain-data-private/files/workflows/seasonal-flu/vic/ha/nextclade.tsv.xz"
+  gisaid:
+    h3n2:
+      s3_metadata: "s3://nextstrain-data-private/files/workflows/seasonal-flu/h3n2/metadata.tsv.xz"
+      s3_nextclade: "s3://nextstrain-data-private/files/workflows/seasonal-flu/h3n2/ha/nextclade.tsv.xz"
+    h1n1pdm:
+      s3_metadata: "s3://nextstrain-data-private/files/workflows/seasonal-flu/h1n1pdm/metadata.tsv.xz"
+      s3_nextclade: "s3://nextstrain-data-private/files/workflows/seasonal-flu/h1n1pdm/ha/nextclade.tsv.xz"
+    vic:
+      s3_metadata: "s3://nextstrain-data-private/files/workflows/seasonal-flu/vic/metadata.tsv.xz"
+      s3_nextclade: "s3://nextstrain-data-private/files/workflows/seasonal-flu/vic/ha/nextclade.tsv.xz"
+
+data_provenances:
+  - gisaid
+
+variant_classifications:
+  - emerging_haplotype
+  - aa_haplotype
 
 lineages:
   - h1n1pdm
@@ -18,18 +26,25 @@ geo_resolutions:
   - country
   - region
 
+min_date: "6M"
+max_date: "0D"
+
 prepare_data:
-  country:
-    min_date: "6M"
-    max_date: "0D"
-    location_min_seq: 150
-    clade_min_seq: 30
-  region:
-    min_date: "6M"
-    max_date: "0D"
-    location_min_seq: 150
-    clade_min_seq: 30
+  gisaid:
+    emerging_haplotype:
+      country:
+        location_min_seq: 150
+        clade_min_seq: 30
+      region:
+        location_min_seq: 150
+        clade_min_seq: 30
+    aa_haplotype:
+      country:
+        location_min_seq: 150
+        clade_min_seq: 30
+      region:
+        location_min_seq: 150
+        clade_min_seq: 30
 
 haplotype_variant_column: "clade"
-variant: "haplotype"
-coloring_field: "emerging_haplotype"
+mutations_column: "founderMuts['clade'].aaSubstitutions"
diff --git a/scripts/add_colors_to_model.py b/scripts/add_colors_to_model.py
index 50d0fb9..967a5af 100644
--- a/scripts/add_colors_to_model.py
+++ b/scripts/add_colors_to_model.py
@@ -8,11 +8,16 @@
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("--model", required=True, help="JSON file of model to add colors to")
     parser.add_argument("--auspice-config", required=True, help="Auspice config JSON with a color scale to use to color variants in the given model")
+    parser.add_argument("--color-schemes", required=True, help="file with color schemes with N tab-delimited colors on row N")
     parser.add_argument("--coloring-field", required=True, help="name of the coloring field in the given Auspice config JSON where the color scale is stored")
     parser.add_argument("--output", required=True, help="JSON file of model with colors added")
 
     args = parser.parse_args()
 
+    # Load the model JSON.
+    with open(args.model, "r", encoding="utf-8") as fh:
+        model = json.load(fh)
+
     # Get color scale from Auspice config.
     with open(args.auspice_config, "r", encoding="utf-8") as fh:
         auspice_config = json.load(fh)
@@ -23,20 +28,33 @@
             color_scale = coloring["scale"]
             break
 
+    # Assign colors from color schemes, if no scale is defined in the Auspice
+    # config for the requested coloring field.
     if color_scale is None:
+        # We need one color per named variant excluding the "other" label.
+        variants = [variant for variant in model["metadata"]["variants"] if variant != "other"]
+        n_colors = len(variants)
+
+        # Load the required number of colors.
+        with open(args.color_schemes, "r", encoding="utf-8") as fh:
+            for line in fh:
+                if len(colors := line.strip().split("\t")) == n_colors:
+                    color_scale = [
+                        [variant, color]
+                        for variant, color in zip(variants, colors)
+                    ]
+                    break
+
+    if color_scale:
+        # Add the color scale to the model JSON.
+        model["metadata"]["variantColors"] = color_scale
+    else:
         print(
-            f"ERROR: Could not find a color scale for the field '{args.coloring_field}' in the given Auspice config JSON.",
+            f"ERROR: Could not find a color scale for the field '{args.coloring_field}' in the given Auspice config JSON or assign enough colors from the color schemes file.",
             file=sys.stderr,
         )
         sys.exit(1)
 
-    # Load the model JSON.
-    with open(args.model, "r", encoding="utf-8") as fh:
-        model = json.load(fh)
-
-    # Add the color scale to the model JSON.
-    model["metadata"]["variantColors"] = color_scale
-
     # Save the modified model JSON.
     with open(args.output, "w", encoding="utf-8") as oh:
         json.dump(model, oh)
diff --git a/scripts/assign_aa_haplotypes.py b/scripts/assign_aa_haplotypes.py
new file mode 100644
index 0000000..ac9041e
--- /dev/null
+++ b/scripts/assign_aa_haplotypes.py
@@ -0,0 +1,104 @@
+"""Annotate a Nextclade-style TSV with an amino acid haplotype per records using each record's clade and amino acid
+substitutions relative to the clade.
+"""
+import argparse
+import json
+import pandas as pd
+
+
+def create_haplotype_for_record(record, clade_column, mutations_column, genes=None, strip_genes=False, sites_by_gene=None):
+    """Create a haplotype string for the given record based on the values in its
+    clade and mutations column. If a list of genes is given, filter mutations to
+    only those in the requested genes.
+
+    """
+    clade = record[clade_column]
+
+    if record[mutations_column] == "":
+        return clade
+
+    mutations = record[mutations_column].split(",")
+
+    # Filter mutations to requested genes.
+    if sites_by_gene is not None:
+        filtered_mutations = []
+        for mutation in mutations:
+            # mutation looks like "HA1:N145K"
+            gene, allele = mutation.split(":")
+            position = allele[1:-1]
+            if gene in sites_by_gene and position in sites_by_gene[gene]:
+                filtered_mutations.append(mutation)
+
+        mutations = filtered_mutations
+    elif genes is not None:
+        mutations = [
+            mutation
+            for mutation in mutations
+            if mutation.split(":")[0] in genes
+        ]
+
+    mutations = "-".join(mutations).replace(":", "-")
+
+    if mutations:
+        if strip_genes and genes is not None:
+            for gene in genes:
+                mutations = mutations.replace(f"{gene}-", "")
+
+        return f"{clade}:{mutations}"
+    else:
+        return clade
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument("--nextclade", required=True, help="TSV file of Nextclade annotations with columns for clade and AA mutations derived from clade")
+    parser.add_argument("--clade-column", help="name of the branch attribute for clade labels in the given Nextclade annotations", default="subclade")
+    parser.add_argument("--mutations-column", help="name of the attribute for mutations relative to clades in the given Nextclade annotations", default="founderMuts['subclade'].aaSubstitutions")
+    parser.add_argument("--genes", nargs="+", help="list of genes to filter mutations to. If not provided, all mutations will be used.")
+    parser.add_argument("--distance-map", help="distance map JSON of genes and positions to include in haplotypes")
+    parser.add_argument("--strip-genes", action="store_true", help="strip gene names from coordinates in output haplotypes")
+    parser.add_argument("--attribute-name", default="haplotype", help="name of attribute to store the derived haplotype in the output file")
+    parser.add_argument("--output", help="TSV file of Nextclade annotations with derived haplotype column added", required=True)
+    args = parser.parse_args()
+
+    # Load Nextclade annotations.
+    df = pd.read_csv(
+        args.nextclade,
+        sep="\t",
+        dtype={
+            args.clade_column: "str",
+            args.mutations_column: "str",
+        },
+        na_filter=False,
+    )
+
+    # Load distance map.
+    sites_by_gene = None
+    if args.distance_map:
+        with open(args.distance_map, "r", encoding="utf-8") as fh:
+            distance_map = json.load(fh)
+            sites_by_gene = distance_map["map"]
+
+    # Annotate derived haplotypes.
+    df[args.attribute_name] = df.apply(
+        lambda record: create_haplotype_for_record(
+            record,
+            args.clade_column,
+            args.mutations_column,
+            args.genes,
+            args.strip_genes,
+            sites_by_gene,
+        ),
+        axis=1
+    )
+
+    # Save updated Nextclade annotations
+    df.to_csv(
+        args.output,
+        sep="\t",
+        index=False,
+    )
diff --git a/workflow/upload.smk b/workflow/upload.smk
index c086c80..0ea6616 100644
--- a/workflow/upload.smk
+++ b/workflow/upload.smk
@@ -10,9 +10,9 @@ def _get_s3_url(w, input_file):
 
 rule copy_latest_model_results_to_dated:
     input:
-        latest_model_results = "results/{lineage}/{geo_resolution}/mlr/MLR_results.json",
+        latest_model_results="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/MLR_results.json",
     output:
-        dated_model_results = "results/{lineage}/{geo_resolution}/mlr/{date}_MLR_results.json",
+        dated_model_results="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/{date}_MLR_results.json",
     shell:
         """
         cp {input.latest_model_results} {output.dated_model_results}
@@ -20,9 +20,9 @@ rule copy_latest_model_results_to_dated:
 
 rule upload_dated_model_results_to_s3:
     input:
-        model_results = "results/{lineage}/{geo_resolution}/mlr/{date}_MLR_results.json"
+        model_results="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/{date}_MLR_results.json",
     output:
-        touch("results/{lineage}/{geo_resolution}/mlr/{date}_results_s3_upload.done")
+        touch("results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/{date}_results_s3_upload.done")
     params:
         s3_url=lambda w, input: _get_s3_url(w, input.model_results),
     shell:
@@ -35,9 +35,9 @@ rule upload_dated_model_results_to_s3:
 
 rule upload_latest_model_results_to_s3:
     input:
-        model_results = "results/{lineage}/{geo_resolution}/mlr/MLR_results.json"
+        model_results="results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/MLR_results.json"
     output:
-        touch("results/{lineage}/{geo_resolution}/mlr/results_s3_upload.done")
+        touch("results/{data_provenance}/{variant_classification}/{lineage}/{geo_resolution}/mlr/results_s3_upload.done")
     params:
         s3_url=lambda w, input: _get_s3_url(w, input.model_results),
     shell:

From d12bfcd207363a2c6f7210b60d612b76f9dc9300 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 15 Dec 2025 15:47:09 -0800
Subject: [PATCH 2/4] Lower threshold for location to be included

More regions or countries are close to the original 150-sequence
threshold but don't get included. Lowering the threshold allows more
regions to be included while keeping the same minimum for clade
inclusion.
---
 config/defaults.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/defaults.yaml b/config/defaults.yaml
index 9facdc1..88a192d 100644
--- a/config/defaults.yaml
+++ b/config/defaults.yaml
@@ -33,17 +33,17 @@ prepare_data:
   gisaid:
     emerging_haplotype:
       country:
-        location_min_seq: 150
+        location_min_seq: 100
         clade_min_seq: 30
       region:
-        location_min_seq: 150
+        location_min_seq: 100
         clade_min_seq: 30
     aa_haplotype:
       country:
-        location_min_seq: 150
+        location_min_seq: 100
         clade_min_seq: 30
       region:
-        location_min_seq: 150
+        location_min_seq: 100
         clade_min_seq: 30
 
 haplotype_variant_column: "clade"

From 556f0423a6d52f1935937e184e89c14e0a5de597 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 22 Dec 2025 13:53:53 -0800
Subject: [PATCH 3/4] Display multiple variant classifications

Update forecasts viz interface to display MLR results for multiple
variant classifications on a single page per subtype and geographic
resolution, following the pattern from forecasts-ncov [1].

Since the `useModelData` functions lives outside of this repo and
expects a single object with a `modelUrl` attribute, this commit creates
two separate config objects with one per variant classification. We call
the model URL function once per config after the initial config object
is copied for each variant classification. This function now accepts
arguments for the variant classification (to get the correct S3 URL) and
for model date (so we only need the date-based update logic in one
place).

As I implemented this expanded version of the display, I found it
simpler to write the panel titles (the `h2` tags) and descriptions right
in the HTML instead of maintaining a separate title per variant
classification, subtype, and geographic resolution. This new
implementation mimics the forecasts-ncov at the expense of some
flexibility in defining different titles and descriptions per
subtype (which we have never needed to do).

[1] https://github.com/nextstrain/forecasts-ncov/blob/940791b/viz/src/App.jsx

Co-authored-by: james hadfield <hadfield.james@gmail.com>
---
 viz/src/main.jsx | 77 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 21 deletions(-)

diff --git a/viz/src/main.jsx b/viz/src/main.jsx
index 6f34a90..a141fa1 100644
--- a/viz/src/main.jsx
+++ b/viz/src/main.jsx
@@ -1,4 +1,4 @@
-import React from 'react'
+import React, {useMemo} from 'react'
 import ReactDOM from 'react-dom/client'
 import '@nextstrain/evofr-viz/dist/index.css';
 import { PanelDisplay, useModelData } from '@nextstrain/evofr-viz';
@@ -29,10 +29,7 @@ const TABS = {
 }
 for (const [key,info] of Object.entries(TABS)) {
   info.modelName = key;
-  info.modelUrl = modelUrl(key);
   info.frequency = {
-    title: `Frequencies for ${info.displayName}`,
-    description: "",
     params: {
       preset: "frequency",
       rawDataToggleName: "Raw Data",
@@ -40,8 +37,6 @@ for (const [key,info] of Object.entries(TABS)) {
     },
   };
   info.growthAdvantage = {
-    title: `Growth advantage for ${info.displayName}`,
-    description: "",
     params: {preset: "growthAdvantage"},
   };
   info.sites = {
@@ -73,8 +68,14 @@ function getModelDate() {
   return (new URLSearchParams(window.location.search)).get('date');
 }
 
-function modelUrl(subtypeResolution) {
-  return `https://data.nextstrain.org/files/workflows/forecasts-flu/${subtypeResolution}/mlr/MLR_results.json`;
+function modelUrl(variantClassification, subtypeResolution, modelDate) {
+  let url = `https://data.nextstrain.org/files/workflows/forecasts-flu/gisaid/${variantClassification}/${subtypeResolution}/mlr/MLR_results.json`;
+
+  if (modelDate) {
+    url = url.replace(/([^/]+)$/, `${modelDate}_MLR_results.json`);
+  }
+
+  return url;
 }
 
 /**
@@ -98,15 +99,25 @@ function filterLocations(model, hierarchical=true) {
 
 function App() {
 
-  const [tabSelected, setTabSelected] = React.useState(getStartingTab)
-  const config = TABS[tabSelected];
   let modelDate = getModelDate();
-  if (modelDate) {
-    config.modelUrl = config.modelUrl.replace(/([^/]+)$/, `${modelDate}_MLR_results.json`)
-  }
+  const [tabSelected, setTabSelected] = React.useState(getStartingTab);
+
+  /* configuration for the viz depends on the tab selected. Wrap this in
+     `useMemo` so that we only reconstruct it when the tab changes. Because the
+     data fetching (via `useModelData`) is triggered each time the config
+     changes we need to avoid recreating this config object else we get into an
+     infinite loop of data fetches!
+    */
+  const config = useMemo(() => {
+    return {
+      emergingHaplotype: Object.assign({}, TABS[tabSelected], {modelUrl: modelUrl("emerging_haplotype", tabSelected, modelDate)}),
+      aaHaplotype:       Object.assign({}, TABS[tabSelected], {modelUrl: modelUrl("aa_haplotype",       tabSelected, modelDate)}),
+    }
+  }, [tabSelected])
 
   // The `useModelData` hook downloads & parses the config-defined JSON
-  const model = useModelData(config)
+  const modelEmergingHaplotype = useModelData(config.emergingHaplotype);
+  const modelAAHaplotype = useModelData(config.aaHaplotype);
 
   function changeTab(key) {
     setTabSelected(key);
@@ -129,26 +140,50 @@ function App() {
 
       <div className="panelsContainer" key={tabSelected}>
 
-        {config.frequency && (
+        {config.emergingHaplotype.frequency && (
+          <>
+            <h2>Emerging haplotype frequencies for {TABS[tabSelected].displayName}</h2>
+              <p>
+                Updated {modelEmergingHaplotype?.modelData?.get('updated') || 'loading'}.
+              </p>
+              <div className="frequencies panelDisplay">
+                <PanelDisplay data={modelEmergingHaplotype} params={config.emergingHaplotype.frequency.params} locations={filterLocations(modelEmergingHaplotype, false)}/>
+              </div>
+          </>
+        )}
+
+        {TABS[tabSelected].growthAdvantage && (
+          <>
+            <h2>Emerging haplotype growth advantages for {TABS[tabSelected].displayName}</h2>
+              <p>
+                Updated {modelEmergingHaplotype?.modelData?.get('updated') || 'loading'}.
+              </p>
+              <div className="panelDisplay">
+                <PanelDisplay data={modelEmergingHaplotype} params={config.emergingHaplotype.growthAdvantage.params} locations={filterLocations(modelEmergingHaplotype)}/>
+              </div>
+          </>
+        )}
+
+        {config.aaHaplotype.frequency && (
           <>
-              <h2>{config.frequency.title}</h2>
+            <h2>Amino acid haplotype frequencies for {TABS[tabSelected].displayName}</h2>
               <p>
-                {config.frequency.description}. Updated {model?.modelData?.get('updated') || 'loading'}.
+                Updated {modelAAHaplotype?.modelData?.get('updated') || 'loading'}.
               </p>
               <div className="frequencies panelDisplay">
-                <PanelDisplay data={model} params={config.frequency.params} locations={filterLocations(model, false)}/>
+                <PanelDisplay data={modelAAHaplotype} params={config.aaHaplotype.frequency.params} locations={filterLocations(modelAAHaplotype, false)}/>
               </div>
           </>
         )}
 
         {TABS[tabSelected].growthAdvantage && (
           <>
-              <h2>{config.growthAdvantage.title}</h2>
+              <h2>Amino acid haplotype growth advantages for {TABS[tabSelected].displayName}</h2>
               <p>
-                {config.growthAdvantage.description}. Updated {model?.modelData?.get('updated') || 'loading'}.
+                Updated {modelAAHaplotype?.modelData?.get('updated') || 'loading'}.
               </p>
               <div className="panelDisplay">
-                <PanelDisplay data={model} params={config.growthAdvantage.params} locations={filterLocations(model)}/>
+                <PanelDisplay data={modelAAHaplotype} params={config.aaHaplotype.growthAdvantage.params} locations={filterLocations(modelAAHaplotype)}/>
               </div>
           </>
         )}

From b25557b2dfab43e32f7b4b6941eac13682907c0f Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 23 Dec 2025 10:19:10 -0800
Subject: [PATCH 4/4] Update change log

---
 CHANGES.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index 93bd95a..8bf3c48 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -6,6 +6,10 @@ This changelog is intended for _humans_ and follows many of the principles from
 Changes for this project _do not_ currently follow the [Semantic Versioning rules](https://semver.org/spec/v2.0.0.html).
 Instead, changes appear below grouped by the date they were added to the workflow.
 
+# 23 December 2025
+
+ - Add frequency and fitness estimates for amino acid haplotypes for each subtype and geographic resolution. See [#26](https://github.com/nextstrain/forecasts-flu/pull/26) for details.
+
 # 4 August 2025
 
  - Enable forecasts for MLR models using a forecast horizon of 6 steps with a frequency estimation interval of 14-days to produce 84-day forecasts. See [#18](https://github.com/nextstrain/forecasts-flu/pull/18) for details.