Skip to content

Commit da21567

Browse files
committed
Latest changes before releasing v1.0.1
- Added option to `align` step to ``--collect_only` the extracted markers and exit -
1 parent c0b66f2 commit da21567

File tree

3 files changed

+69
-33
lines changed

3 files changed

+69
-33
lines changed

captus/align.py

+58-32
Original file line numberDiff line numberDiff line change
@@ -172,9 +172,14 @@ def align(full_command, args):
172172

173173
collect_extracted_markers(markers, formats, args.max_paralogs, args.min_samples,
174174
extracted_sample_dirs, out_dir, settings.ALN_DIRS["UNAL"],
175-
refs_paths, args.overwrite, show_less)
175+
refs_paths, args.collect_only, args.overwrite, show_less)
176176
log.log("")
177177

178+
if args.collect_only:
179+
successful_exit(
180+
"Captus-assembly: ALIGN -> successfully completed, '--collect_only' was enabled"
181+
f" [{elapsed_time(time.time() - captus_start)}]"
182+
)
178183

179184

180185
################################################################################################
@@ -858,7 +863,7 @@ def make_output_dirtree(markers, formats, out_dir, base_dir, margin):
858863

859864
def collect_extracted_markers(
860865
markers, formats, max_paralogs, min_samples, extracted_sample_dirs, out_dir, base_dir,
861-
refs_paths, overwrite, show_less
866+
refs_paths, collect_only, overwrite, show_less
862867
):
863868
source_files = [Path(settings.MARKER_DIRS[m], f"{m}{settings.FORMAT_SUFFIXES[f]}")
864869
for m in markers.split(",") for f in formats.split(",")
@@ -894,20 +899,24 @@ def collect_extracted_markers(
894899

895900
# Collect markers per sample's source FASTAs into a dictionary that can be updated in parallel
896901
fastas_per_marker = {}
897-
collect_marker_names_params = []
902+
collect_sample_markers_params = []
898903
for source_fasta_path in fastas_per_sample:
899-
collect_marker_names_params.append([
904+
collect_sample_markers_params.append([
900905
fastas_per_marker,
901906
source_fasta_path,
902907
fastas_per_sample[source_fasta_path]["sample_name"],
903908
fastas_per_sample[source_fasta_path]["destination"],
904909
fastas_per_sample[source_fasta_path]["suffix"],
905910
max_paralogs,
906911
])
907-
tqdm_serial_run(collect_sample_markers, collect_marker_names_params,
912+
tqdm_serial_run(collect_sample_markers, collect_sample_markers_params,
908913
"Collecting extracted markers",
909914
"Collection of extracted markers finished",
910915
"source", show_less)
916+
if max_paralogs == -1:
917+
max_paralog_rank = get_max_paralog_rank(collect_sample_markers_params)
918+
else:
919+
max_paralog_rank = max_paralogs
911920

912921
# Write FASTAs compiled per marker when they have at least four samples
913922
log.log("")
@@ -937,36 +946,37 @@ def collect_extracted_markers(
937946
f" {skipped} already existed and were skipped"
938947
)
939948

940-
# Add references to all the possible collected FASTAs
941-
if refs_paths is not None:
942-
refs = [
943-
refs_paths["NUC"]["AA_path"], refs_paths["NUC"]["NT_path"],
944-
refs_paths["PTD"]["AA_path"], refs_paths["PTD"]["NT_path"],
945-
refs_paths["MIT"]["AA_path"], refs_paths["MIT"]["NT_path"],
946-
refs_paths["DNA"]["NT_path"], refs_paths["DNA"]["NT_path"],
947-
refs_paths["CLR"]["NT_path"], refs_paths["CLR"]["NT_path"],
948-
]
949-
mrks = ["NUC", "NUC", "PTD", "PTD", "MIT", "MIT", "DNA", "DNA", "CLR", "CLR"]
950-
fmts = [ "AA", "NT", "AA", "NT", "AA", "NT", "MA", "MF", "MA", "MF"]
951-
add_refs_params = []
952-
manager = Manager()
953-
shared_ref_names = manager.list()
954-
for r, m, f in zip(refs, mrks, fmts):
955-
if all([r, m in markers.upper().split(","), f in formats.upper().split(",")]):
956-
add_refs_params.append((
957-
r,
958-
Path(out_dir, base_dir, settings.MARKER_DIRS[m], settings.FORMAT_DIRS[f]),
959-
shared_ref_names,
960-
))
961-
if bool(add_refs_params):
962-
log.log("")
963-
tqdm_serial_run(add_refs, add_refs_params,
964-
"Adding reference markers", "Addition of reference markers finished",
965-
"reference", show_less)
949+
manager = Manager()
950+
shared_ref_names = manager.list()
951+
if not collect_only:
952+
# Add references to all the possible collected FASTAs
953+
if refs_paths is not None:
954+
refs = [
955+
refs_paths["NUC"]["AA_path"], refs_paths["NUC"]["NT_path"],
956+
refs_paths["PTD"]["AA_path"], refs_paths["PTD"]["NT_path"],
957+
refs_paths["MIT"]["AA_path"], refs_paths["MIT"]["NT_path"],
958+
refs_paths["DNA"]["NT_path"], refs_paths["DNA"]["NT_path"],
959+
refs_paths["CLR"]["NT_path"], refs_paths["CLR"]["NT_path"],
960+
]
961+
mrks = ["NUC", "NUC", "PTD", "PTD", "MIT", "MIT", "DNA", "DNA", "CLR", "CLR"]
962+
fmts = [ "AA", "NT", "AA", "NT", "AA", "NT", "MA", "MF", "MA", "MF"]
963+
add_refs_params = []
964+
for r, m, f in zip(refs, mrks, fmts):
965+
if all([r, m in markers.upper().split(","), f in formats.upper().split(",")]):
966+
add_refs_params.append((
967+
r,
968+
Path(out_dir, base_dir, settings.MARKER_DIRS[m], settings.FORMAT_DIRS[f]),
969+
shared_ref_names,
970+
))
971+
if bool(add_refs_params):
972+
log.log("")
973+
tqdm_serial_run(add_refs, add_refs_params,
974+
"Adding reference markers", "Addition of reference markers finished",
975+
"reference", show_less)
966976

967977
# Write ASTRAL-Pro sequence to sample equivalence tsv file
968978
astral_pro_tsv = write_astral_pro_seq_to_sam(out_dir,
969-
max_paralogs,
979+
max_paralog_rank,
970980
shared_ref_names,
971981
sample_names)
972982
if astral_pro_tsv:
@@ -1013,6 +1023,22 @@ def collect_sample_markers(
10131023
return message
10141024

10151025

1026+
def get_max_paralog_rank(collect_sample_markers_params: list):
1027+
max_paralog_rank = 0
1028+
for params in collect_sample_markers_params:
1029+
fasta_in = fasta_to_dict(params[1])
1030+
for seq_name_full in fasta_in:
1031+
seq_name_parts = seq_name_full.split(settings.SEQ_NAME_SEP)
1032+
seq_name = seq_name_parts[0]
1033+
if len(seq_name_parts) == 3:
1034+
paralog_rank = int(seq_name_parts[2])
1035+
else:
1036+
paralog_rank = 0
1037+
if paralog_rank > max_paralog_rank:
1038+
max_paralog_rank = paralog_rank
1039+
return max_paralog_rank
1040+
1041+
10161042
def add_refs(ref_path, dest_dir, shared_ref_names):
10171043
start = time.time()
10181044
fastas_in_dest = list(Path(dest_dir).rglob("*.f[an]a"))

captus/captus_assembly.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -1142,7 +1142,7 @@ def align(self):
11421142
help="Maximum number of secondary hits (copies) per sample to import from the"
11431143
" extraction step. Large numbers of marker copies per sample can increase"
11441144
" alignment times. Hits (copies) are ranked from best to worst during the"
1145-
" 'extract' step. -1 disables the initial removal of paralogs and aligns which"
1145+
" 'extract' step. -1 disables the removal of paralogs and aligns them all, which"
11461146
" might be useful if you expect very high ploidy levels for example"
11471147
)
11481148
input_group.add_argument(
@@ -1306,6 +1306,13 @@ def align(self):
13061306
)
13071307

13081308
other_group = parser.add_argument_group("Other")
1309+
other_group.add_argument(
1310+
"--collect_only",
1311+
action="store_true",
1312+
dest="collect_only",
1313+
help="Only collect the markers from the extraction folder and exit (skips addition of"
1314+
" reference target sequences and subsequent steps)"
1315+
)
13091316
other_group.add_argument(
13101317
"--redo_from",
13111318
action="store",

docs/content/assembly/align/options.md

+3
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,9 @@ This argument is optional, the default is **0**.
154154
___
155155
## *Other*
156156
___
157+
### **`--collect_only`**
158+
Only collect the markers from the extraction folder and exit, it skips the addition of reference target sequences and subsequent steps
159+
___
157160
### **`--redo_from`**
158161
You can repeat the analysis without undoing all the steps. These are the points from which you can restart the `align` command:
159162
- `alignment` = Delete all subdirectories with alignments and restart.

0 commit comments

Comments
 (0)