From 2aa432c65e41a2ab33c79069cae0b825d57e1aba Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Wed, 27 Apr 2022 15:26:11 -0400 Subject: [PATCH 01/13] Fix filter to remove identical names --- .gitignore | 2 ++ outbreaker/workflows/outbreaker_summary_report.Rmd | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f9564f6..33c5779 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,5 @@ dmypy.json .DS_Store + +.snakemake/ diff --git a/outbreaker/workflows/outbreaker_summary_report.Rmd b/outbreaker/workflows/outbreaker_summary_report.Rmd index 5f298d7..2ddd436 100644 --- a/outbreaker/workflows/outbreaker_summary_report.Rmd +++ b/outbreaker/workflows/outbreaker_summary_report.Rmd @@ -249,7 +249,8 @@ distances <- read.csv(params$snp_dists, header = FALSE, na.strings=c("","NA"), stringsAsFactors=FALSE, sep=",") %>% filter(! grepl("MN908947", V1) & - ! grepl("MN908947", V2)) + ! grepl("MN908947", V2)) %>% + filter(V1 != V2) filtered_w_background <- subset(distances, V1 %in% subset(tr.df.labs, category == "Focal_Sequence")$label & ! 
V2 %in% subset(tr.df.labs, category == "Focal_Sequence")$label) From 163c31b0aefcf38b6151f7541b8c2e4a2c605e3c Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Thu, 28 Apr 2022 10:51:34 -0400 Subject: [PATCH 02/13] Fix rename without names csv w report --- outbreaker/workflows/outbreaker.smk | 23 +++++++++------- .../workflows/outbreaker_summary_report.Rmd | 27 +++++++++++++------ 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/outbreaker/workflows/outbreaker.smk b/outbreaker/workflows/outbreaker.smk index 43b1a92..6cf2f2c 100644 --- a/outbreaker/workflows/outbreaker.smk +++ b/outbreaker/workflows/outbreaker.smk @@ -1,6 +1,7 @@ import os import sys import click +import pandas as pd if not config["outdir"]: config["outdir"] = os.getcwd() + "/outbreaker/" @@ -21,6 +22,7 @@ rule all: os.path.join(config["outdir"], config["prefix"] + ".fa"), os.path.join(config["outdir"], config["prefix"] + "_filtered.fa") if config["filter"] else [], os.path.join(config["outdir"], config["prefix"] + "_renamed.fa") if config["rename"] else [], + os.path.join(config["outdir"], config["prefix"] + "_rename_matches.csv") if config["rename"] and not config["names_csv"] else [], os.path.join(config["outdir"], config["prefix"] + "_aln.fasta"), os.path.join(config["outdir"], config["prefix"] + "_snipit.jpg"), os.path.join(config["outdir"], config["prefix"]+ "_tree.nwk"), @@ -134,7 +136,8 @@ rule rename_headers: fasta = rules.create_subset.output.sub_fasta, names_csv = config["names_csv"] if config["names_csv"] else [] output: - renamed = os.path.join(config["outdir"], config["prefix"] + "_renamed.fa") + renamed = os.path.join(config["outdir"], config["prefix"] + "_renamed.fa"), + names_matches = os.path.join(config["outdir"], config["prefix"] + "_rename_matches.csv") if not config["names_csv"] else [] run: if config["rename"]: if config["names_csv"]: @@ -146,19 +149,21 @@ rule rename_headers: else: fasta_to_open = open(input.fasta) newfasta = open(output.renamed, 'w') + 
names_matches = {} + name_counter = 1 for line in fasta_to_open: if line.startswith('>'): line_cleaned = line.strip('>').strip() - try: - replacement_name = "ON-PHL-" + line_cleaned.split("PHLON")[1].split("-SARS")[0] + "-" + line_cleaned.split("PHLON")[1].split("-SARS")[1] - except IndexError: - replacement_name = line_cleaned + replacement_name = config["prefix"] + "_" + str(name_counter) newfasta.write(">" + replacement_name + "\n") + names_matches[line_cleaned] = replacement_name + name_counter += 1 else: newfasta.write(line) fasta_to_open.close() newfasta.close() + pd.DataFrame(names_matches.items(), columns=['original_name', 'new_name']).to_csv(output.names_matches, index = False) sys.stderr.write(f'\nrenamed multi-FASTA headers into: {output.renamed}\n') @@ -321,15 +326,13 @@ rule summary_report: renamed = convertPythonBooleanToR(config["rename"]), names_sheet_read = absol_path(config["names_csv"]) if config["names_csv"] else [], prefix_input = str(config["prefix"]), - report_output = absol_path(os.path.join(config["outdir"])) + "/" + report_output = absol_path(os.path.join(config["outdir"])) + "/", + name_matches = absol_path(os.path.join(config["outdir"], config["prefix"] + "_rename_matches.csv")) if config["rename"] and not config["names_csv"] else [] run: if config["report"]: shell( """ - Rscript -e \"rmarkdown::render(input = '{params.script}', params = list(focal_list = '{params.focal_read}', background_list = '{params.background_read}', snp_dists = '{params.snp_read}', snp_tree = '{params.snp_tree_read}', full_tree = '{params.full_tree_read}', snipit = '{params.snipit_read}', renamed = '{params.renamed}', names_csv = '{params.names_sheet_read}', outbreak_prefix = '{params.prefix_input}', outbreak_directory = '{params.report_output}'), output_file = '{params.output}')\" + Rscript -e \"rmarkdown::render(input = '{params.script}', params = list(focal_list = '{params.focal_read}', background_list = '{params.background_read}', snp_dists = 
'{params.snp_read}', snp_tree = '{params.snp_tree_read}', full_tree = '{params.full_tree_read}', snipit = '{params.snipit_read}', renamed = '{params.renamed}', names_csv = '{params.names_sheet_read}', outbreak_prefix = '{params.prefix_input}', outbreak_directory = '{params.report_output}', name_matches = '{params.name_matches}'), output_file = '{params.output}')\" """) - - - diff --git a/outbreaker/workflows/outbreaker_summary_report.Rmd b/outbreaker/workflows/outbreaker_summary_report.Rmd index 2ddd436..668121d 100644 --- a/outbreaker/workflows/outbreaker_summary_report.Rmd +++ b/outbreaker/workflows/outbreaker_summary_report.Rmd @@ -29,6 +29,9 @@ params: value: "" outbreak_directory: value: "" + name_matches: + input: file + value: "" output: html_document: toc: yes @@ -108,9 +111,10 @@ if (file_ext(params$focal_list) %in% fasta_extensions) { ```{r, echo=F, warning=F, message=F} if (params$renamed == "TRUE" & params$names_csv == "") { + + rename_matches <- read.csv(params$name_matches) + new_focal_names <- as.vector(subset(rename_matches, original_name %in% focal_input$Sequence)$new_name) - new_focal_names <- as.vector(paste("ON-PHL", str_split_fixed(focal_input$Sequence, "PHLON|-SARS", 4)[,2], - str_split_fixed(focal_input$Sequence, "PHLON|-SARS", 4)[,3], sep = "-")) } else if (params$renamed == "TRUE" & params$names_csv != "") { renaming_sheet <- read.csv(params$names_csv, header = T, na.strings=c("","NA"), @@ -250,16 +254,23 @@ distances <- read.csv(params$snp_dists, header = FALSE, stringsAsFactors=FALSE, sep=",") %>% filter(! grepl("MN908947", V1) & ! grepl("MN908947", V2)) %>% - filter(V1 != V2) + filter(V1 != V2) -filtered_w_background <- subset(distances, V1 %in% subset(tr.df.labs, category == "Focal_Sequence")$label & - ! V2 %in% subset(tr.df.labs, category == "Focal_Sequence")$label) +filtered_w_background <- subset(distances, V1 %in% as.vector(subset(tr.df.labs, category == "Focal_Sequence")$label) & + ! 
V2 %in% as.vector(subset(tr.df.labs, category == "Focal_Sequence")$label)) -filtered_only_focal <- subset(distances, V1 %in% subset(tr.df.labs, category == "Focal_Sequence")$label & - V2 %in% subset(tr.df.labs, category == "Focal_Sequence")$label) +filtered_only_focal <- subset(distances, V1 %in% as.vector(subset(tr.df.labs, category == "Focal_Sequence")$label) & + V2 %in% as.vector(subset(tr.df.labs, category == "Focal_Sequence")$label)) -distance_frame_only_focal <- as.data.frame(table(filtered_only_focal$V3)) %>% mutate(Var1 = as.numeric(as.character(Var1))) +distance_frame_only_focal <- as.data.frame(table(filtered_only_focal$V3)) + +if (nrow(distance_frame_only_focal) != 0) { + distance_frame_only_focal <- distance_frame_only_focal %>% mutate(Var1 = as.numeric(as.character(Var1))) colnames(distance_frame_only_focal) <- c("SNP_Distance", "Frequency") +} else { + distance_frame_only_focal <- data.frame(SNP_Distance = numeric(), + Frequency = numeric()) +} distance_frame_w_background <-as.data.frame(table(filtered_w_background$V3)) From 302f62e1a41f9237bd86015099518ccf504a0535 Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Thu, 28 Apr 2022 11:42:50 -0400 Subject: [PATCH 03/13] Initial pytest --- tests/test_outbreaker.py | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/test_outbreaker.py diff --git a/tests/test_outbreaker.py b/tests/test_outbreaker.py new file mode 100644 index 0000000..b1f8ede --- /dev/null +++ b/tests/test_outbreaker.py @@ -0,0 +1,46 @@ +import os +from outbreaker import main +import sys +from Bio import SeqIO + +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'data/')) +print(DATA_DIR) + +test_reference = os.path.join(DATA_DIR, 'reference', 'ncov_reference.gb') + + +class TestOutbreaker: + def test_read_test_focal_fasta(self): + query_file = os.path.join(DATA_DIR, 'tests/', 'focal_seqs.fa') + assert len(list(SeqIO.parse(query_file, "fasta"))) == 4 + def 
test_read_test_background_fasta(self): + query_file = os.path.join(DATA_DIR, 'tests/', 'background_seqs.fa') + assert len(list(SeqIO.parse(query_file, "fasta"))) == 6 + + def test_run_outputs(self, tmp_path): + focal_seqs = os.path.join(DATA_DIR, 'tests/', 'focal_seqs.fa') + background_seqs = os.path.join(DATA_DIR, 'tests/', 'background_seqs.fa') + + args = ['-f', str(focal_seqs), '-b', str(background_seqs), '--rename', '-p', 'pytest', + '-r', str(test_reference), '-o', str(tmp_path)] + + main.main(sysargs = args) + output_merged_fasta = os.path.join(tmp_path, 'pytest_renamed.fa') + assert len(list(SeqIO.parse(output_merged_fasta, "fasta"))) == 10 + + new_names = ["pytest_" + str(i) for i in range(1, 11, 1)] + names_in_fasta = [] + for record in SeqIO.parse(output_merged_fasta, "fasta"): + names_in_fasta.append(record.id) + assert names_in_fasta == new_names + + + + + + + + + + + From 3a06af9084decebf5406dd66cf9a13627c1fc5a7 Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Thu, 28 Apr 2022 13:11:30 -0400 Subject: [PATCH 04/13] Add pytests + github actions --- .github/workflows/main.yml | 3 +++ tests/test_outbreaker.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cd0fd92..3263e26 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,3 +32,6 @@ jobs: - name: Run outbreaker test via config shell: bash -l {0} run: outbreaker -c data/test_config.yaml + - name: Run pytest for outbreaker + shell: bash -l {0} + run: pytest tests/ diff --git a/tests/test_outbreaker.py b/tests/test_outbreaker.py index b1f8ede..68cbc64 100644 --- a/tests/test_outbreaker.py +++ b/tests/test_outbreaker.py @@ -35,6 +35,36 @@ def test_run_outputs(self, tmp_path): assert names_in_fasta == new_names + def test_run_with_missing_names_csv(self, tmp_path): + focal_seqs = os.path.join(DATA_DIR, 'tests/', 'focal_seqs.fa') + background_seqs = os.path.join(DATA_DIR, 'tests/', 
'background_seqs.fa') + names_csv = os.path.join(DATA_DIR, 'tests/', 'names.csv') + + args = ['-f', str(focal_seqs), '-b', str(background_seqs), '--rename', '-p', 'pytest', + '-r', str(test_reference), '-o', str(tmp_path), '--names-csv', str(names_csv)] + + main.main(sysargs=args) + + output_merged_fasta = os.path.join(tmp_path, 'pytest_renamed.fa') + names_in_fasta = [] + for record in SeqIO.parse(output_merged_fasta, "fasta"): + names_in_fasta.append(record.id) + names_not_all = ['Renamed_1', 'Renamed_2', 'Renamed_3', + 'Focal_4', 'Renamed_4', 'Renamed_5', 'Background_3', + 'Renamed_6', 'Renamed_7', 'Renamed_8'] + assert names_in_fasta == names_not_all + + output_snp_dists = os.path.join(tmp_path, "pytest_snp_dists.csv") + + with open(output_snp_dists) as f: + lines = f.readlines() + assert str('Renamed_8,Background_3,5\n') in lines + + + + + + From 4ca33260c6dd869d034eaf69f63f17b98069688a Mon Sep 17 00:00:00 2001 From: Matthew Watson Date: Thu, 28 Apr 2022 13:49:21 -0400 Subject: [PATCH 05/13] add r to channels for CI env create --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3263e26..d5ba251 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,7 +19,7 @@ jobs: with: environment-file: environments/environment.yml activate-environment: ncov_outbreaker - channels: conda-forge,bioconda,defaults + channels: conda-forge,bioconda,defaults,r - name: Install outbreaker shell: bash -l {0} run: pip install -e . 
From 487a121e79846df844d3305716543b3a62ae17c1 Mon Sep 17 00:00:00 2001 From: Matthew Watson Date: Thu, 28 Apr 2022 14:01:26 -0400 Subject: [PATCH 06/13] add mamba and matrix for ubuntu to CI --- .github/workflows/main.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d5ba251..2028834 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,7 +12,11 @@ on: jobs: build: - runs-on: ubuntu-latest + name: Outbreaker test on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ["ubuntu-latest"] steps: - uses: actions/checkout@v2 - uses: conda-incubator/setup-miniconda@v2 @@ -20,8 +24,8 @@ jobs: environment-file: environments/environment.yml activate-environment: ncov_outbreaker channels: conda-forge,bioconda,defaults,r + mamba-version: "*" - name: Install outbreaker - shell: bash -l {0} run: pip install -e . - name: Check outbreaker version shell: bash -l {0} From 31bc886d8b48e753fc6b7cd0bdadff36db58c074 Mon Sep 17 00:00:00 2001 From: Matthew Watson Date: Thu, 28 Apr 2022 14:07:29 -0400 Subject: [PATCH 07/13] try removing minimal spec for snakemake --- environments/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/environment.yml b/environments/environment.yml index 060a9e9..21647e4 100644 --- a/environments/environment.yml +++ b/environments/environment.yml @@ -30,7 +30,7 @@ dependencies: - r-essentials - r-traminer - scipy=1.6.3 - - snakemake-minimal + - snakemake - snp-dists=0.8.2 - snp-sites=2.5.1 - vcftools=0.1.16 From 3232b8c5ce87ba9d18c2fb7526ca81019423035b Mon Sep 17 00:00:00 2001 From: Matthew Watson Date: Thu, 28 Apr 2022 14:19:01 -0400 Subject: [PATCH 08/13] try keeping older version of snakemake --- environments/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/environment.yml b/environments/environment.yml index 21647e4..6a0923c 100644 --- 
a/environments/environment.yml +++ b/environments/environment.yml @@ -30,7 +30,7 @@ dependencies: - r-essentials - r-traminer - scipy=1.6.3 - - snakemake + - snakemake-minimal<=6.8.0 - snp-dists=0.8.2 - snp-sites=2.5.1 - vcftools=0.1.16 From 64d35e9ef25437de1ddfea23c689b296a4d57495 Mon Sep 17 00:00:00 2001 From: Matthew Watson Date: Thu, 28 Apr 2022 14:31:20 -0400 Subject: [PATCH 09/13] try explicit install of snakemake min with pip --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2028834..8add243 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,6 +25,8 @@ jobs: activate-environment: ncov_outbreaker channels: conda-forge,bioconda,defaults,r mamba-version: "*" + - name: Install Python dependencies + run: python -m pip install --upgrade pip snakemake-minimal<=6.8.0 - name: Install outbreaker run: pip install -e . - name: Check outbreaker version From baf9dd0e03bcc46521f8a6fd250dc1eb27eee1cd Mon Sep 17 00:00:00 2001 From: Matthew Watson Date: Thu, 28 Apr 2022 14:39:18 -0400 Subject: [PATCH 10/13] try pip install no -e --- .github/workflows/main.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8add243..2275d00 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,10 +25,8 @@ jobs: activate-environment: ncov_outbreaker channels: conda-forge,bioconda,defaults,r mamba-version: "*" - - name: Install Python dependencies - run: python -m pip install --upgrade pip snakemake-minimal<=6.8.0 - name: Install outbreaker - run: pip install -e . + run: pip install . 
- name: Check outbreaker version shell: bash -l {0} run: outbreaker -v From 88b474320146c69f42e6be3f107b163e29332dd9 Mon Sep 17 00:00:00 2001 From: Matthew Watson Date: Thu, 28 Apr 2022 14:51:14 -0400 Subject: [PATCH 11/13] add snakemake to setup requirements --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7642aaf..d44fd62 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ author='Matthew Watson', author_email='matthew.watson@oahpp.ca', description='snakemake and Python integrated workflow for intermediate file generation for COVID outbreak analysis', - install_requires = ["pandas>=1.1.5", "numpy>=1.19", "biopython>=1.79"], + install_requires = ["pandas>=1.1.5", "numpy>=1.19", "biopython>=1.79", "snakemake-minimal<=6.8.0"], entry_points=""" [console_scripts] {program} = outbreaker.main:main From d9f5d02f183eef9abb642afea267542286e10220 Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Thu, 28 Apr 2022 14:57:51 -0400 Subject: [PATCH 12/13] Remove snakemake minimal --- environments/environment.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/environment.yml b/environments/environment.yml index 6a0923c..21647e4 100644 --- a/environments/environment.yml +++ b/environments/environment.yml @@ -30,7 +30,7 @@ dependencies: - r-essentials - r-traminer - scipy=1.6.3 - - snakemake-minimal<=6.8.0 + - snakemake - snp-dists=0.8.2 - snp-sites=2.5.1 - vcftools=0.1.16 diff --git a/setup.py b/setup.py index d44fd62..40c6a56 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ author='Matthew Watson', author_email='matthew.watson@oahpp.ca', description='snakemake and Python integrated workflow for intermediate file generation for COVID outbreak analysis', - install_requires = ["pandas>=1.1.5", "numpy>=1.19", "biopython>=1.79", "snakemake-minimal<=6.8.0"], + install_requires = ["pandas>=1.1.5", "numpy>=1.19", "biopython>=1.79", "snakemake>=7.0.0"], entry_points=""" 
[console_scripts] {program} = outbreaker.main:main From fb95943b6edc46beb164bbb2e0bf32940ef70917 Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Thu, 28 Apr 2022 16:05:47 -0400 Subject: [PATCH 13/13] Bump version, changelog, docs --- CHANGELOG.md | 8 +++++++- README.md | 2 +- docs/2-INPUTS.md | 26 ++++++++++++-------------- outbreaker/__init__.py | 2 +- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b0e7c0..7307180 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,5 +58,11 @@ ## Minor Version 0.6.4, 17-02-22 - -outbreaker now retains all sequences if ```--names-csv``` is used for renaming and not all sequences are contained in the CSV + - outbreaker now retains all sequences if ```--names-csv``` is used for renaming and not all sequences are contained in the CSV - updates to the renaming behavior to be compatible with fastafurious v1.2.0 (additional warning messages) + +## Minor Version 0.6.5, 28-04-22 (Patch) + - Change behaviour of renaming when no CSV is supplied. Will now use the prefix for the run to generate new names with alphanumerical sequential order (i.e. prefix_1, prefix_2) and will output the name matches as a CSV file + - The above change fixes the error in the SNP distance plot in the HTML report when rename is used but no names CSV is supplied + - Addition of pytests in the CI/CD workflow + diff --git a/README.md b/README.md index e430251..55aae9a 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,6 @@ More detailed documentation for outbreaker usage and functionality can be found ## Acknowledgments -Inspiration for code structure and design for outbreaker was inspired by [pangolin](https://github.com/cov-lineages/pangolin) and [civet](https://github.com/artic-network/civet), and minor code blocks were adopted from these software. 
\ +Inspiration for code structure and design for outbreaker was inspired by [pangolin](https://github.com/cov-lineages/pangolin) and [civet](https://github.com/artic-network/civet), and minor code blocks were adopted from these software. The **Background** section in the documentation describing outbreak definitions was written by Mark Horsman. diff --git a/docs/2-INPUTS.md b/docs/2-INPUTS.md index 651fe2d..31e7bf7 100644 --- a/docs/2-INPUTS.md +++ b/docs/2-INPUTS.md @@ -30,21 +30,19 @@ The following inputs are purely optional, but may augment the types of analysis ## Sample head renaming -For PHO outbreak analysis, it is common to rename a sample COVID-19 sequence with a different alias for privacy purposes, especially if the outbreak analysis is to be shared with external collaborators. A typical renaming scheme for PHO COVID-19 samples would follow the following pattern: \ -Original sample name: PHLON20-SARS##### or PHLON22-SARS##### -New sample name: ON-PHL-20-##### or ON-PHL-21-##### \ -where ##### denotes the specific WGS Id that is used to track the genomic sequence within the PHO laboratory. +It is common to rename a sample COVID-19 sequence with a different alias for privacy purposes, especially if the outbreak analysis is to be shared with external collaborators. \ outbreaker is designed to facilitate the renaming of FASTA headers to accommodate privacy guidelines and/or to use different label aliases for the outbreak. This feature can be toggled on using ```--rename```. There are two different renaming possibilities for user when ```--rename``` is enabled: \ - • **Option 1**: The workflow will auto-detect any FASTA headers that have the format PHLON{20,21}-SARS##### and change them to ON-PHL-{20,21}-#####. If the FASTA header does not follow this format, it will be left as is (i.e. Gisaid sample headers that follow a different format, or external samples) \ - • **Option 2**: A CSV file of FASTA labels can be supplied using --names_csv. 
This requires that ALL focal and background samples be included in the table. The contents of the table should have the following scheme as an example: -original_name -new_name -PHLON21-SARS29115 -sequence_1 -PHLON21-SARS15665 -sequence_2 -This table will allow outbreaker to rename the above PHLON sequences with sequence_# headers in all downstream input files generated by the workflow. -If ```--names_csv```, the CSV headers must have original_name for the current/original header name, and new_name for the target/output name to run properly. + • **Option 1**: outbreaker will use the run prefix supplied at runtime to create a new alias for each sample. For example, for a run with 10 samples and the run prefix "apartment_can", the new sample names will range from apartment_can_1 to apartment_can_10. A CSV matching the original and newly generated names will be added to the output directory. \ + • **Option 2**: A CSV file of FASTA labels can be supplied using --names_csv. This allows for custom labels for specific samples. Note that not all samples need to have a new name in this CSV. If a sample does not have a corresponding new name, it is left as is (as of outbreaker v0.6.4). +The format of this CSV should be as follows: +``` +original_name new_name +PHLON21-SARS29115 sequence_1 +PHLON21-SARS15665 sequence_2 +``` + +This table will allow outbreaker to use fastafurious to rename the above PHLON sequences with sequence_# headers in all downstream input files generated by the workflow. \ +If ```--names_csv``` is supplied, the CSV headers must have original_name for the current/original header name, and new_name for the target/output name to run properly. ## Optional argument descriptions diff --git a/outbreaker/__init__.py b/outbreaker/__init__.py index 7b17ff8..cf91f7a 100644 --- a/outbreaker/__init__.py +++ b/outbreaker/__init__.py @@ -1,2 +1,2 @@ _program = "outbreaker" -__version__ = "0.6.4" +__version__ = "0.6.5"