
Commit 18cbe26

Author: yupeng.zhou (committed)
Commit message: twoperson
1 parent: ae1398a

3 files changed: +898 -115 lines

gradio_app_sdxl_specific_id_low_vram.py: +116 -115
@@ -21,7 +21,7 @@
         AttnProcessor2_0 as AttnProcessor
 else:
     from utils.gradio_utils import AttnProcessor
-
+import datetime
 import diffusers
 from diffusers import StableDiffusionXLPipeline
 from utils import PhotoMakerStableDiffusionXLPipeline
@@ -181,83 +181,6 @@ def __call__(
             cur_step += 1
             indices1024,indices4096 = cal_attn_indice_xl_effcient_memory(self.total_length,self.id_length,sa32,sa64,height,width, device=self.device, dtype= self.dtype)

-        return hidden_states
-    def __call1__(
-        self,
-        attn,
-        hidden_states,
-        encoder_hidden_states=None,
-        attention_mask=None,
-        temb=None,
-        attn_indices = None,
-    ):
-        # print("hidden state shape",hidden_states.shape,self.id_length)
-        residual = hidden_states
-        # if encoder_hidden_states is not None:
-        #     raise Exception("not implement")
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            total_batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(total_batch_size, channel, height * width).transpose(1, 2)
-        total_batch_size,nums_token,channel = hidden_states.shape
-        img_nums = total_batch_size//2
-        hidden_states = hidden_states.view(-1,img_nums,nums_token,channel).reshape(-1,img_nums * nums_token,channel)
-        batch_size, sequence_length, _ = hidden_states.shape
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states  # B, N, C
-        else:
-            encoder_hidden_states = encoder_hidden_states.view(-1,self.id_length+1,nums_token,channel).reshape(-1,(self.id_length+1) * nums_token,channel)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        # print(key.shape,value.shape,query.shape,attention_mask.shape)
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1
-        # print(query.shape,key.shape,value.shape,attention_mask.shape)
-        hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-        )
-
-        hidden_states = hidden_states.transpose(1, 2).reshape(total_batch_size, -1, attn.heads * head_dim)
-        hidden_states = hidden_states.to(query.dtype)
-
-
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        # if input_ndim == 4:
-        #     tile_hidden_states = tile_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        # if attn.residual_connection:
-        #     tile_hidden_states = tile_hidden_states + residual
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(total_batch_size, channel, height, width)
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-        hidden_states = hidden_states / attn.rescale_output_factor
-        # print(hidden_states.shape)
         return hidden_states
     def __call2__(
         self,
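Note: the hunk above removes the `__call1__` attention path. For readers skimming the diff, here is a minimal, self-contained sketch of the reshape-attend-reshape pattern that function implemented via PyTorch's `F.scaled_dot_product_attention`; the function name, shapes, and test tensor below are illustrative, not the app's actual modules.

```python
import torch
import torch.nn.functional as F

def sdpa_multihead(query, key, value, heads):
    # query/key/value: (batch, tokens, inner_dim), as produced by to_q/to_k/to_v
    batch, _, inner_dim = query.shape
    head_dim = inner_dim // heads
    # Split the channel dim into heads: (batch, heads, tokens, head_dim)
    q = query.view(batch, -1, heads, head_dim).transpose(1, 2)
    k = key.view(batch, -1, heads, head_dim).transpose(1, 2)
    v = value.view(batch, -1, heads, head_dim).transpose(1, 2)
    # The output of sdp is (batch, heads, tokens, head_dim)
    out = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=False)
    # Merge heads back: (batch, tokens, inner_dim)
    return out.transpose(1, 2).reshape(batch, -1, heads * head_dim)

x = torch.randn(2, 64, 320)
print(sdpa_multihead(x, x, x, heads=8).shape)  # torch.Size([2, 64, 320])
```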
@@ -393,6 +316,60 @@ def set_attention_processor(unet,id_length,is_ipadapter = False):
 <style>
 '''

+def save_single_character_weights(unet,character,description, filepath):
+    """
+    Save the list of id_bank GPU tensors from the attention_processor instances to the given file.
+    Args:
+    - model: the model containing the attention_processor instances.
+    - filepath: the file path the weights are saved to.
+    """
+    weights_to_save = {}
+    weights_to_save["description"] = description
+    weights_to_save["character"] = character
+    for attn_name, attn_processor in unet.attn_processors.items():
+        if isinstance(attn_processor, SpatialAttnProcessor2_0):
+            # Move each tensor to the CPU so that it can be serialized.
+            weights_to_save[attn_name] = {}
+            for step_key in attn_processor.id_bank[character].keys():
+                weights_to_save[attn_name][step_key] = [tensor.cpu() for tensor in attn_processor.id_bank[character][step_key]]
+    # Persist the weights with torch.save.
+    torch.save(weights_to_save, filepath)
+
+def load_single_character_weights(unet, filepath):
+    """
+    Load weights from the given file into the id_bank of the attention_processor instances.
+    Args:
+    - model: the model containing the attention_processor instances.
+    - filepath: the path of the weight file.
+    """
+    # Read the weights with torch.load.
+    weights_to_load = torch.load(filepath, map_location=torch.device('cpu'))
+    character = weights_to_load['character']
+    description = weights_to_load['description']
+    for attn_name, attn_processor in unet.attn_processors.items():
+        if isinstance(attn_processor, SpatialAttnProcessor2_0):
+            # Move the weights to the GPU (if one is available) and assign them to id_bank.
+            attn_processor.id_bank[character] = {}
+            for step_key in weights_to_load[attn_name].keys():
+                attn_processor.id_bank[character][step_key] = [tensor.to(unet.device) for tensor in weights_to_load[attn_name][step_key]]
+
+def save_results(unet,img_list):
+
+    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
+    folder_name = f'results/{timestamp}'
+    weight_folder_name = f'{folder_name}/weights'
+    # Create the output folders.
+    if not os.path.exists(folder_name):
+        os.makedirs(folder_name)
+        os.makedirs(weight_folder_name)
+
+    for idx, img in enumerate(img_list):
+        file_path = os.path.join(folder_name, f'image_{idx}.png')  # image file name
+        img.save(file_path)
+    global character_dict
+    # for char in character_dict:
+    #     description = character_dict[char]
+    #     save_single_character_weights(unet,char,description,os.path.join(weight_folder_name, f'{char}.pt'))

 #################################################
 title = r"""
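Note: the new helpers serialize a character's id_bank as a plain dict checkpoint. Below is a hypothetical sketch of the resulting `.pt` layout and a CPU round-trip, following `save_single_character_weights` / `load_single_character_weights` above; the attention-layer name, character string, and tensor shape are made-up placeholders.

```python
import torch

# Hypothetical layout of a saved character file: top-level metadata plus a
# per-attention-layer dict mapping step keys to lists of id_bank tensors.
weights = {
    "description": "a man with short black hair",
    "character": "[Taylor]",
    "up_blocks.0.attentions.1.transformer_blocks.0.attn1.processor": {
        0: [torch.randn(2, 1024, 640).cpu()],  # id_bank tensors for one step
    },
}
torch.save(weights, "Taylor.pt")

loaded = torch.load("Taylor.pt", map_location=torch.device("cpu"))
print(loaded["character"], loaded["description"])
```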
@@ -426,14 +403,14 @@ def set_attention_processor(unet,id_length,is_ipadapter = False):
 ```
 📋 **License**
 <br>
-The Contents you create are under Apache-2.0 LICENSE. The Code are under Attribution-NonCommercial 4.0 International.
+Apache-2.0 LICENSE.

 📧 **Contact**
 <br>
 If you have any questions, please feel free to reach me out at <b>[email protected]</b>.
 """
 version = r"""
-<h3 align="center">StoryDiffusion Version 0.01 (test version)</h3>
+<h3 align="center">StoryDiffusion Version 0.02 (test version)</h3>

 <h5 >1. Support image ref image. (Cartoon Ref image is not support now)</h5>
 <h5 >2. Support Typesetting Style and Captioning.(By default, the prompt is used as the caption for each image. If you need to change the caption, add a # at the end of each line. Only the part after the # will be added as a caption to the image.)</h5>
@@ -528,9 +505,29 @@ def change_visiale_by_model_type(_model_type):
     else:
         raise ValueError("Invalid model type",_model_type)

+def load_character_files(character_files:str):
+    if character_files == "":
+        raise gr.Error("Please set a character file!")
+    character_files_arr = character_files.splitlines()
+    primarytext = []
+    for character_file_name in character_files_arr:
+        character_file = torch.load(character_file_name, map_location=torch.device('cpu'))
+        primarytext.append(character_file["character"] + character_file["description"])
+    return array2string(primarytext)
+
+
+def load_character_files_on_running(unet,character_files:str):
+    if character_files == "":
+        return False
+    character_files_arr = character_files.splitlines()
+    for character_file in character_files_arr:
+        load_single_character_weights(unet, character_file)
+    return True

 ######### Image Generation ##############
-def process_generation(_sd_type,_model_type,_upload_images, _num_steps,style_name, _Ip_Adapter_Strength ,_style_strength_ratio, guidance_scale, seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt,prompt_array,G_height,G_width,_comic_type, font_choice): # Corrected font_choice usage
+def process_generation(_sd_type,_model_type,_upload_images, _num_steps,style_name, _Ip_Adapter_Strength ,_style_strength_ratio, guidance_scale, seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt,prompt_array,G_height,G_width,_comic_type, font_choice,_char_files): # Corrected font_choice usage
+    if len(general_prompt.splitlines()) >= 3:
+        raise gr.Error("Support for more than three characters is temporarily unavailable due to VRAM limitations, but this issue will be resolved soon.")
     _model_type = "Photomaker" if _model_type == "Using Ref Images" else "original"
     if _model_type == "Photomaker" and "img" not in general_prompt:
         raise gr.Error("Please add the triger word \" img \" behind the class word you want to customize, such as: man img or woman img")
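Note: `load_character_files` only rebuilds the general-prompt text; the weights themselves are loaded later by `load_character_files_on_running`. A sketch of that prompt reconstruction, reusing the hypothetical `Taylor.pt` from the earlier example and assuming `array2string` (defined outside this diff) joins entries with newlines:

```python
import torch

primarytext = []
for character_file_name in ["Taylor.pt"]:  # one file path per textbox line
    character_file = torch.load(character_file_name, map_location=torch.device("cpu"))
    primarytext.append(character_file["character"] + character_file["description"])
# Assumed behavior of array2string: join the entries with newlines.
print("\n".join(primarytext))  # [Taylor]a man with short black hair
```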
@@ -574,17 +571,13 @@ def process_generation(_sd_type,_model_type,_upload_images, _num_steps,style_nam
     unet = pipe.unet
     # unet.set_attn_processor(copy.deepcopy(attn_procs))

-
+    load_chars = load_character_files_on_running(unet,character_files=_char_files)

     prompts = prompt_array.splitlines()
     global character_dict,character_index_dict,invert_character_index_dict,ref_indexs_dict,ref_totals
     character_dict,character_list = character_to_dict(general_prompt)

-
-
-
-
     start_merge_step = int(float(_style_strength_ratio) / 100 * _num_steps)
     if start_merge_step > 30:
         start_merge_step = 30
@@ -627,33 +620,37 @@ def process_generation(_sd_type,_model_type,_upload_images, _num_steps,style_nam
     id_images = []
     results_dict = {}
     global cur_character
-    for character_key in character_dict.keys():
-        cur_character = [character_key]
-        ref_indexs = ref_indexs_dict[character_key]
-        print(character_key,ref_indexs)
-        current_prompts = [replace_prompts[ref_ind] for ref_ind in ref_indexs]
-        print(current_prompts)
-        setup_seed(seed_)
-        generator = torch.Generator(device="cuda").manual_seed(seed_)
-        cur_step = 0
-        cur_positive_prompts, negative_prompt = apply_style(style_name, current_prompts, negative_prompt)
-        if _model_type == "original":
-            id_images = pipe(cur_positive_prompts, num_inference_steps=_num_steps, guidance_scale=guidance_scale, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images
-        elif _model_type == "Photomaker":
-            id_images = pipe(cur_positive_prompts,input_id_images=input_id_images_dict[character_key], num_inference_steps=_num_steps, guidance_scale=guidance_scale, start_merge_step = start_merge_step, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images
-        else:
-            raise NotImplementedError("You should choice between original and Photomaker!",f"But you choice {_model_type}")
-
-        # total_results = id_images + total_results
-        # yield total_results
-        print(id_images)
-        for ind,img in enumerate(id_images):
-            print(ref_indexs[ind])
-            results_dict[ref_indexs[ind]] = img
-        # real_images = []
+    if not load_chars:
+        for character_key in character_dict.keys():
+            cur_character = [character_key]
+            ref_indexs = ref_indexs_dict[character_key]
+            print(character_key,ref_indexs)
+            current_prompts = [replace_prompts[ref_ind] for ref_ind in ref_indexs]
+            print(current_prompts)
+            setup_seed(seed_)
+            generator = torch.Generator(device="cuda").manual_seed(seed_)
+            cur_step = 0
+            cur_positive_prompts, negative_prompt = apply_style(style_name, current_prompts, negative_prompt)
+            if _model_type == "original":
+                id_images = pipe(cur_positive_prompts, num_inference_steps=_num_steps, guidance_scale=guidance_scale, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images
+            elif _model_type == "Photomaker":
+                id_images = pipe(cur_positive_prompts,input_id_images=input_id_images_dict[character_key], num_inference_steps=_num_steps, guidance_scale=guidance_scale, start_merge_step = start_merge_step, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images
+            else:
+                raise NotImplementedError("You should choice between original and Photomaker!",f"But you choice {_model_type}")
+
+            # total_results = id_images + total_results
+            # yield total_results
+            print(id_images)
+            for ind,img in enumerate(id_images):
+                print(ref_indexs[ind])
+                results_dict[ref_indexs[ind]] = img
+            # real_images = []
+            yield [results_dict[ind] for ind in results_dict.keys()]
     write = False
-
-    real_prompts_inds = [ind for ind in range(len(prompts)) if ind not in ref_totals]
+    if not load_chars:
+        real_prompts_inds = [ind for ind in range(len(prompts)) if ind not in ref_totals]
+    else:
+        real_prompts_inds = [ind for ind in range(len(prompts))]
     print(real_prompts_inds)

     for real_prompts_ind in real_prompts_inds:
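Note: the effect of the new `load_chars` branch on frame selection can be checked in isolation. When characters are preloaded from files, every prompt line becomes a "real" frame; otherwise the identity-reference frames are excluded. The index values below are made up for the example.

```python
prompts = ["p0", "p1", "p2", "p3"]
ref_totals = [0, 1]  # indices already consumed as identity references

for load_chars in (False, True):
    if not load_chars:
        real_prompts_inds = [ind for ind in range(len(prompts)) if ind not in ref_totals]
    else:
        real_prompts_inds = [ind for ind in range(len(prompts))]
    print(load_chars, real_prompts_inds)
# False [2, 3]
# True [0, 1, 2, 3]
```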
@@ -672,7 +669,7 @@ def process_generation(_sd_type,_model_type,_upload_images, _num_steps,style_nam
             results_dict[real_prompts_ind] = (pipe(real_prompt, input_id_images=input_id_images_dict[cur_character[0]] if real_prompts_ind not in nc_indexs else input_id_images_dict[character_list[0]], num_inference_steps=_num_steps, guidance_scale=guidance_scale, start_merge_step = start_merge_step, height = height, width = width,negative_prompt = negative_prompt,generator = generator,nc_flag = True if real_prompts_ind in nc_indexs else False).images[0])
         else:
             raise NotImplementedError("You should choice between original and Photomaker!",f"But you choice {_model_type}")
-
+        yield [results_dict[ind] for ind in results_dict.keys()]
     total_results = [results_dict[ind] for ind in range(len(prompts))]
     if _comic_type != "No typesetting (default)":
         captions= prompt_array.splitlines()
@@ -684,6 +681,8 @@ def process_generation(_sd_type,_model_type,_upload_images, _num_steps,style_nam
         print(f"Attempting to load font from path: {font_path}")
         font = ImageFont.truetype(font_path, int(45))
         total_results = get_comic(total_results, _comic_type, captions=captions, font=font) + total_results
+    save_results(pipe.unet,total_results)
+
     yield total_results

@@ -731,6 +730,8 @@ def array2string(arr):
             negative_prompt = gr.Textbox(value='', label="(2) Negative_prompt", interactive=True)
             style = gr.Dropdown(label="Style template", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
             prompt_array = gr.Textbox(lines = 3,value='', label="(3) Comic Description (each line corresponds to a frame).", interactive=True)
+            char_path = gr.Textbox(lines = 2,value='', visible = False,label="(Optional) Character files", interactive=True)
+            char_btn = gr.Button("Load Character files",visible = False)
             with gr.Accordion("(4) Tune the hyperparameters", open=True):
                 font_choice = gr.Dropdown(label="Select Font", choices=[f for f in os.listdir("./fonts") if f.endswith('.ttf')], value="Inkfree.ttf", info="Select font for the final slide.", interactive=True)
                 sa32_ = gr.Slider(label=" (The degree of Paired Attention at 32 x 32 self-attention layers) ", minimum=0, maximum=1., value=0.5, step=0.1)
@@ -792,9 +793,9 @@ def array2string(arr):
     model_type.change(fn = change_visiale_by_model_type , inputs = model_type, outputs=[control_image_input,style_strength_ratio,Ip_Adapter_Strength])
     files.upload(fn=swap_to_gallery, inputs=files, outputs=[uploaded_files, clear_button, files])
     remove_and_reupload.click(fn=remove_back_to_files, outputs=[uploaded_files, clear_button, files])
-
+    char_btn.click(fn=load_character_files,inputs=char_path,outputs=[general_prompt])
     final_run_btn.click(fn=set_text_unfinished, outputs=generated_information
-    ).then(process_generation, inputs=[sd_type,model_type,files, num_steps,style, Ip_Adapter_Strength,style_strength_ratio, guidance_scale, seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt, prompt_array,G_height,G_width,comic_type, font_choice], outputs=out_image
+    ).then(process_generation, inputs=[sd_type,model_type,files, num_steps,style, Ip_Adapter_Strength,style_strength_ratio, guidance_scale, seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt, prompt_array,G_height,G_width,comic_type, font_choice,char_path], outputs=out_image
     ).then(fn=set_text_finished,outputs=generated_information)
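Note: the hidden character-file controls follow a standard Gradio pattern, an invisible `Textbox` plus a `Button` wired with `.click`. A minimal standalone sketch with a stand-in handler (the real app routes through `load_character_files` / `array2string`):

```python
import gradio as gr

def load_character_files(character_files: str):
    if character_files == "":
        raise gr.Error("Please set a character file!")
    # Stand-in for the app's array2string over character/description pairs.
    return " ".join(character_files.splitlines())

with gr.Blocks() as demo:
    general_prompt = gr.Textbox(label="(1) Textual Description for Character")
    char_path = gr.Textbox(lines=2, value="", visible=False, label="(Optional) Character files")
    char_btn = gr.Button("Load Character files", visible=False)
    # Clicking the (hidden) button rewrites the general prompt from the files.
    char_btn.click(fn=load_character_files, inputs=char_path, outputs=[general_prompt])

# demo.launch()
```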