@@ -172,9 +172,14 @@ def align(full_command, args):
172
172
173
173
collect_extracted_markers (markers , formats , args .max_paralogs , args .min_samples ,
174
174
extracted_sample_dirs , out_dir , settings .ALN_DIRS ["UNAL" ],
175
- refs_paths , args .overwrite , show_less )
175
+ refs_paths , args .collect_only , args . overwrite , show_less )
176
176
log .log ("" )
177
177
178
+ if args .collect_only :
179
+ successful_exit (
180
+ "Captus-assembly: ALIGN -> successfully completed, '--collect_only' was enabled"
181
+ f" [{ elapsed_time (time .time () - captus_start )} ]"
182
+ )
178
183
179
184
180
185
################################################################################################
@@ -858,7 +863,7 @@ def make_output_dirtree(markers, formats, out_dir, base_dir, margin):
858
863
859
864
def collect_extracted_markers (
860
865
markers , formats , max_paralogs , min_samples , extracted_sample_dirs , out_dir , base_dir ,
861
- refs_paths , overwrite , show_less
866
+ refs_paths , collect_only , overwrite , show_less
862
867
):
863
868
source_files = [Path (settings .MARKER_DIRS [m ], f"{ m } { settings .FORMAT_SUFFIXES [f ]} " )
864
869
for m in markers .split ("," ) for f in formats .split ("," )
@@ -894,20 +899,24 @@ def collect_extracted_markers(
894
899
895
900
# Collect markers per sample's source FASTAs into a dictionary that can be updated in parallel
896
901
fastas_per_marker = {}
897
- collect_marker_names_params = []
902
+ collect_sample_markers_params = []
898
903
for source_fasta_path in fastas_per_sample :
899
- collect_marker_names_params .append ([
904
+ collect_sample_markers_params .append ([
900
905
fastas_per_marker ,
901
906
source_fasta_path ,
902
907
fastas_per_sample [source_fasta_path ]["sample_name" ],
903
908
fastas_per_sample [source_fasta_path ]["destination" ],
904
909
fastas_per_sample [source_fasta_path ]["suffix" ],
905
910
max_paralogs ,
906
911
])
907
- tqdm_serial_run (collect_sample_markers , collect_marker_names_params ,
912
+ tqdm_serial_run (collect_sample_markers , collect_sample_markers_params ,
908
913
"Collecting extracted markers" ,
909
914
"Collection of extracted markers finished" ,
910
915
"source" , show_less )
916
+ if max_paralogs == - 1 :
917
+ max_paralog_rank = get_max_paralog_rank (collect_sample_markers_params )
918
+ else :
919
+ max_paralog_rank = max_paralogs
911
920
912
921
# Write FASTAs compiled per marker when they have at least four samples
913
922
log .log ("" )
@@ -937,36 +946,37 @@ def collect_extracted_markers(
937
946
f" { skipped } already existed and were skipped"
938
947
)
939
948
940
- # Add references to all the possible collected FASTAs
941
- if refs_paths is not None :
942
- refs = [
943
- refs_paths ["NUC" ]["AA_path" ], refs_paths ["NUC" ]["NT_path" ],
944
- refs_paths ["PTD" ]["AA_path" ], refs_paths ["PTD" ]["NT_path" ],
945
- refs_paths ["MIT" ]["AA_path" ], refs_paths ["MIT" ]["NT_path" ],
946
- refs_paths ["DNA" ]["NT_path" ], refs_paths ["DNA" ]["NT_path" ],
947
- refs_paths ["CLR" ]["NT_path" ], refs_paths ["CLR" ]["NT_path" ],
948
- ]
949
- mrks = ["NUC" , "NUC" , "PTD" , "PTD" , "MIT" , "MIT" , "DNA" , "DNA" , "CLR" , "CLR" ]
950
- fmts = [ "AA" , "NT" , "AA" , "NT" , "AA" , "NT" , "MA" , "MF" , "MA" , "MF" ]
951
- add_refs_params = []
952
- manager = Manager ()
953
- shared_ref_names = manager .list ()
954
- for r , m , f in zip (refs , mrks , fmts ):
955
- if all ([r , m in markers .upper ().split ("," ), f in formats .upper ().split ("," )]):
956
- add_refs_params .append ((
957
- r ,
958
- Path (out_dir , base_dir , settings .MARKER_DIRS [m ], settings .FORMAT_DIRS [f ]),
959
- shared_ref_names ,
960
- ))
961
- if bool (add_refs_params ):
962
- log .log ("" )
963
- tqdm_serial_run (add_refs , add_refs_params ,
964
- "Adding reference markers" , "Addition of reference markers finished" ,
965
- "reference" , show_less )
949
+ manager = Manager ()
950
+ shared_ref_names = manager .list ()
951
+ if not collect_only :
952
+ # Add references to all the possible collected FASTAs
953
+ if refs_paths is not None :
954
+ refs = [
955
+ refs_paths ["NUC" ]["AA_path" ], refs_paths ["NUC" ]["NT_path" ],
956
+ refs_paths ["PTD" ]["AA_path" ], refs_paths ["PTD" ]["NT_path" ],
957
+ refs_paths ["MIT" ]["AA_path" ], refs_paths ["MIT" ]["NT_path" ],
958
+ refs_paths ["DNA" ]["NT_path" ], refs_paths ["DNA" ]["NT_path" ],
959
+ refs_paths ["CLR" ]["NT_path" ], refs_paths ["CLR" ]["NT_path" ],
960
+ ]
961
+ mrks = ["NUC" , "NUC" , "PTD" , "PTD" , "MIT" , "MIT" , "DNA" , "DNA" , "CLR" , "CLR" ]
962
+ fmts = [ "AA" , "NT" , "AA" , "NT" , "AA" , "NT" , "MA" , "MF" , "MA" , "MF" ]
963
+ add_refs_params = []
964
+ for r , m , f in zip (refs , mrks , fmts ):
965
+ if all ([r , m in markers .upper ().split ("," ), f in formats .upper ().split ("," )]):
966
+ add_refs_params .append ((
967
+ r ,
968
+ Path (out_dir , base_dir , settings .MARKER_DIRS [m ], settings .FORMAT_DIRS [f ]),
969
+ shared_ref_names ,
970
+ ))
971
+ if bool (add_refs_params ):
972
+ log .log ("" )
973
+ tqdm_serial_run (add_refs , add_refs_params ,
974
+ "Adding reference markers" , "Addition of reference markers finished" ,
975
+ "reference" , show_less )
966
976
967
977
# Write ASTRAL-Pro sequence to sample equivalence tsv file
968
978
astral_pro_tsv = write_astral_pro_seq_to_sam (out_dir ,
969
- max_paralogs ,
979
+ max_paralog_rank ,
970
980
shared_ref_names ,
971
981
sample_names )
972
982
if astral_pro_tsv :
@@ -1013,6 +1023,22 @@ def collect_sample_markers(
1013
1023
return message
1014
1024
1015
1025
1026
def get_max_paralog_rank(collect_sample_markers_params: list):
    """
    Return the highest paralog rank found in the sequence names of the given source FASTAs.

    Args:
        collect_sample_markers_params: list of parameter lists; only the element at index 1
            of each one is used, and it is passed to `fasta_to_dict` as the FASTA to load.

    Returns:
        The largest rank found as an `int`, or 0 when the list is empty or no sequence
        name carries a rank greater than 0.

    Raises:
        ValueError: if a name splits into exactly three parts and the third part cannot be
            parsed by `int()`.
    """
    max_paralog_rank = 0
    for params in collect_sample_markers_params:
        # Iterating the object returned by `fasta_to_dict` yields the full sequence names
        # (presumably the keys of a dict, verify against `fasta_to_dict`)
        for seq_name_full in fasta_to_dict(params[1]):
            seq_name_parts = seq_name_full.split(settings.SEQ_NAME_SEP)
            # Only names with exactly three separator-delimited parts carry a rank (the
            # third part); any other name is counted as rank 0
            paralog_rank = int(seq_name_parts[2]) if len(seq_name_parts) == 3 else 0
            max_paralog_rank = max(max_paralog_rank, paralog_rank)
    return max_paralog_rank
1040
+
1041
+
1016
1042
def add_refs (ref_path , dest_dir , shared_ref_names ):
1017
1043
start = time .time ()
1018
1044
fastas_in_dest = list (Path (dest_dir ).rglob ("*.f[an]a" ))
0 commit comments