diff --git a/.github/workflows/samples_connections.yml b/.github/workflows/samples_connections.yml index 023251d4d4a..707f4598741 100644 --- a/.github/workflows/samples_connections.yml +++ b/.github/workflows/samples_connections.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/connections run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -57,10 +59,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/connections/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/connections/README.md -o examples/connections + - name: Cat script + working-directory: examples/connections + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/connections run: | diff --git a/.github/workflows/samples_flows_chat_basic_chat.yml b/.github/workflows/samples_flows_chat_basic_chat.yml index a399f370f94..1013e485921 100644 --- a/.github/workflows/samples_flows_chat_basic_chat.yml +++ b/.github/workflows/samples_flows_chat_basic_chat.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/chat/basic-chat run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -57,10 +59,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/chat/basic-chat/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/chat/basic-chat/README.md -o examples/flows/chat/basic-chat + - name: Cat script + working-directory: examples/flows/chat/basic-chat + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/chat/basic-chat run: | diff --git a/.github/workflows/samples_flows_chat_chat_with_pdf.yml b/.github/workflows/samples_flows_chat_chat_with_pdf.yml index 3ca4c00af28..1aaf06a3d80 100644 --- a/.github/workflows/samples_flows_chat_chat_with_pdf.yml +++ b/.github/workflows/samples_flows_chat_chat_with_pdf.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/chat/chat-with-pdf run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -48,8 +50,11 @@ jobs: - name: Create AOAI Connection from ENV file working-directory: examples/flows/chat/chat-with-pdf run: | - pf connection create --file .env --name chat_with_pdf_custom_connection - pf connection list + if [[ -e .env ]]; then + pf connection create --file .env --name chat_with_pdf_custom_connection + pf connection list + fi + - name: Create run.yml working-directory: examples/flows/chat/chat-with-pdf run: | @@ -62,10 +67,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/chat/chat-with-pdf/README.md working-directory: ${{ 
github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/chat/chat-with-pdf/README.md -o examples/flows/chat/chat-with-pdf + - name: Cat script + working-directory: examples/flows/chat/chat-with-pdf + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/chat/chat-with-pdf run: | diff --git a/.github/workflows/samples_flows_chat_chat_with_wikipedia.yml b/.github/workflows/samples_flows_chat_chat_with_wikipedia.yml index 343f1584c56..cdd9492debf 100644 --- a/.github/workflows/samples_flows_chat_chat_with_wikipedia.yml +++ b/.github/workflows/samples_flows_chat_chat_with_wikipedia.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/chat/chat-with-wikipedia run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -57,10 +59,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/chat/chat-with-wikipedia/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/chat/chat-with-wikipedia/README.md -o examples/flows/chat/chat-with-wikipedia + - name: Cat script + working-directory: examples/flows/chat/chat-with-wikipedia + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/chat/chat-with-wikipedia run: | diff --git a/.github/workflows/samples_flows_chat_chatwithpdf_chatwithpdf.yml b/.github/workflows/samples_flows_chat_chatwithpdf_chatwithpdf.yml index 4e1a934d8b6..c2dbd6a39a4 100644 --- a/.github/workflows/samples_flows_chat_chatwithpdf_chatwithpdf.yml +++ b/.github/workflows/samples_flows_chat_chatwithpdf_chatwithpdf.yml @@ -34,7 +34,7 @@ jobs: python -m pip install --upgrade pip pip install -r ${{ github.workspace }}/examples/requirements.txt pip install -r ${{ github.workspace }}/examples/dev_requirements.txt - - name: Create Aoai Connection + - name: Create Chat With PDF Custom Connection working-directory: examples/flows/chat/chat-with-pdf run: | AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} @@ -46,6 +46,14 @@ jobs: mv .env.example .env pf connection create --file .env --name chat_with_pdf_custom_connection fi + - name: Create AOAI Connection + working-directory: examples/flows/chat/chat-with-pdf + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + if [[ -e azure_openai.yml ]]; then + pf connection create --file azure_openai.yml --set api_key=$AOAI_API_KEY api_base=$AOAI_API_ENDPOINT + fi - name: Azure Login uses: azure/login@v1 with: diff --git a/.github/workflows/samples_flows_evaluation_basic_eval.yml b/.github/workflows/samples_flows_evaluation_basic_eval.yml index 6453935eb21..518a131cbbc 100644 --- a/.github/workflows/samples_flows_evaluation_basic_eval.yml +++ b/.github/workflows/samples_flows_evaluation_basic_eval.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/evaluation/basic-eval run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -57,10 +59,14 @@ jobs: uses: azure/login@v1 
with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/evaluation/basic-eval/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/evaluation/basic-eval/README.md -o examples/flows/evaluation/basic-eval + - name: Cat script + working-directory: examples/flows/evaluation/basic-eval + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/evaluation/basic-eval run: | diff --git a/.github/workflows/samples_flows_evaluation_classification_accuracy_eval.yml b/.github/workflows/samples_flows_evaluation_classification_accuracy_eval.yml index bd209a27535..d99dd5cc5b9 100644 --- a/.github/workflows/samples_flows_evaluation_classification_accuracy_eval.yml +++ b/.github/workflows/samples_flows_evaluation_classification_accuracy_eval.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/evaluation/classification-accuracy-eval run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -57,10 +59,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/evaluation/classification-accuracy-eval/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/evaluation/classification-accuracy-eval/README.md -o examples/flows/evaluation/classification-accuracy-eval + - name: Cat script + working-directory: examples/flows/evaluation/classification-accuracy-eval + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/evaluation/classification-accuracy-eval run: | diff --git a/.github/workflows/samples_flows_evaluation_eval_entity_match_rate.yml b/.github/workflows/samples_flows_evaluation_eval_entity_match_rate.yml new file mode 100644 index 00000000000..02a0baf40f9 --- /dev/null +++ b/.github/workflows/samples_flows_evaluation_eval_entity_match_rate.yml @@ -0,0 +1,84 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: samples_flows_evaluation_eval_entity_match_rate +on: + schedule: + - cron: "10 22 * * *" # Every day starting at 6:10 BJT + pull_request: + branches: [ main ] + paths: [ examples/** ] + workflow_dispatch: + +jobs: + samples_readme_ci: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Generate config.json + run: echo ${{ secrets.TEST_WORKSPACE_CONFIG_JSON }} > ${{ github.workspace }}/examples/config.json + - name: Prepare requirements + working-directory: examples/flows/evaluation/eval-entity-match-rate + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/flows/evaluation/eval-entity-match-rate + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s/<aoai-api-key>/$AOAI_API_KEY/g" -e "s/<aoai-api-endpoint>/$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + - name: Create run.yml + working-directory: examples/flows/evaluation/eval-entity-match-rate + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/flows/evaluation/eval-entity-match-rate/README.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/flows/evaluation/eval-entity-match-rate/README.md -o examples/flows/evaluation/eval-entity-match-rate + - name: Cat script + working-directory: examples/flows/evaluation/eval-entity-match-rate + run: | + cat bash_script.sh + - name: Run scripts + working-directory: examples/flows/evaluation/eval-entity-match-rate + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME }} + bash bash_script.sh + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/flows/evaluation/eval-entity-match-rate/bash_script.sh \ No newline at end of file diff --git a/.github/workflows/samples_flows_evaluation_groundedness_eval.yml b/.github/workflows/samples_flows_evaluation_groundedness_eval.yml new file mode 100644 index 00000000000..1a80c6f6ccc --- /dev/null +++ b/.github/workflows/samples_flows_evaluation_groundedness_eval.yml @@ -0,0 +1,84 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated.
+ +name: samples_flows_evaluation_groundedness_eval +on: + schedule: + - cron: "10 22 * * *" # Every day starting at 6:10 BJT + pull_request: + branches: [ main ] + paths: [ examples/** ] + workflow_dispatch: + +jobs: + samples_readme_ci: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Generate config.json + run: echo ${{ secrets.TEST_WORKSPACE_CONFIG_JSON }} > ${{ github.workspace }}/examples/config.json + - name: Prepare requirements + working-directory: examples/flows/evaluation/groundedness-eval + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/flows/evaluation/groundedness-eval + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s/<aoai-api-key>/$AOAI_API_KEY/g" -e "s/<aoai-api-endpoint>/$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + - name: Create run.yml + working-directory: examples/flows/evaluation/groundedness-eval + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/flows/evaluation/groundedness-eval/README.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/flows/evaluation/groundedness-eval/README.md -o examples/flows/evaluation/groundedness-eval + - name: Cat script + working-directory: examples/flows/evaluation/groundedness-eval + run: | + cat bash_script.sh + - name: Run scripts + working-directory: examples/flows/evaluation/groundedness-eval + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME }} + bash bash_script.sh + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/flows/evaluation/groundedness-eval/bash_script.sh \ No newline at end of file diff --git a/.github/workflows/samples_flows_evaluation_perceived_intelligence_eval.yml b/.github/workflows/samples_flows_evaluation_perceived_intelligence_eval.yml new file mode 100644 index 00000000000..19e421c6eef --- /dev/null +++ b/.github/workflows/samples_flows_evaluation_perceived_intelligence_eval.yml @@ -0,0 +1,84 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated.
+ +name: samples_flows_evaluation_perceived_intelligence_eval +on: + schedule: + - cron: "47 21 * * *" # Every day starting at 5:47 BJT + pull_request: + branches: [ main ] + paths: [ examples/** ] + workflow_dispatch: + +jobs: + samples_readme_ci: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Generate config.json + run: echo ${{ secrets.TEST_WORKSPACE_CONFIG_JSON }} > ${{ github.workspace }}/examples/config.json + - name: Prepare requirements + working-directory: examples/flows/evaluation/perceived-intelligence-eval + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/flows/evaluation/perceived-intelligence-eval + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s/<aoai-api-key>/$AOAI_API_KEY/g" -e "s/<aoai-api-endpoint>/$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + - name: Create run.yml + working-directory: examples/flows/evaluation/perceived-intelligence-eval + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/flows/evaluation/perceived-intelligence-eval/README.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/flows/evaluation/perceived-intelligence-eval/README.md -o examples/flows/evaluation/perceived-intelligence-eval + - name: Cat script + working-directory: examples/flows/evaluation/perceived-intelligence-eval + run: | + cat bash_script.sh + - name: Run scripts + working-directory: examples/flows/evaluation/perceived-intelligence-eval + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME }} + bash bash_script.sh + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/flows/evaluation/perceived-intelligence-eval/bash_script.sh \ No newline at end of file diff --git a/.github/workflows/samples_flows_standard_basic.yml b/.github/workflows/samples_flows_standard_basic.yml index 5eb27d45259..bc764ce1c1a 100644 --- a/.github/workflows/samples_flows_standard_basic.yml +++ b/.github/workflows/samples_flows_standard_basic.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/standard/basic run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev
requirements working-directory: examples run: | @@ -57,10 +59,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/standard/basic/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/basic/README.md -o examples/flows/standard/basic + - name: Cat script + working-directory: examples/flows/standard/basic + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/standard/basic run: | diff --git a/.github/workflows/samples_flows_standard_basic_with_builtin_llm.yml b/.github/workflows/samples_flows_standard_basic_with_builtin_llm.yml index 597bdaf4fbf..4d4ff1f7929 100644 --- a/.github/workflows/samples_flows_standard_basic_with_builtin_llm.yml +++ b/.github/workflows/samples_flows_standard_basic_with_builtin_llm.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/standard/basic-with-builtin-llm run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -60,10 +62,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/standard/basic-with-builtin-llm/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/basic-with-builtin-llm/README.md -o examples/flows/standard/basic-with-builtin-llm + - name: Cat script + working-directory: examples/flows/standard/basic-with-builtin-llm + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/standard/basic-with-builtin-llm run: | diff --git a/.github/workflows/samples_flows_standard_basic_with_connection.yml b/.github/workflows/samples_flows_standard_basic_with_connection.yml index efe76af3de6..d4d0f034e63 100644 --- a/.github/workflows/samples_flows_standard_basic_with_connection.yml +++ b/.github/workflows/samples_flows_standard_basic_with_connection.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/standard/basic-with-connection run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -60,10 +62,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/standard/basic-with-connection/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/basic-with-connection/README.md -o examples/flows/standard/basic-with-connection + - name: Cat script + working-directory: examples/flows/standard/basic-with-connection + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/standard/basic-with-connection run: | diff --git a/.github/workflows/samples_flows_standard_flow_with_additional_includes.yml b/.github/workflows/samples_flows_standard_flow_with_additional_includes.yml index a7a54fc60e7..8cafd973abb 100644 --- a/.github/workflows/samples_flows_standard_flow_with_additional_includes.yml +++ 
b/.github/workflows/samples_flows_standard_flow_with_additional_includes.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/standard/flow-with-additional-includes run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -60,10 +62,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/standard/flow-with-additional-includes/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/flow-with-additional-includes/README.md -o examples/flows/standard/flow-with-additional-includes + - name: Cat script + working-directory: examples/flows/standard/flow-with-additional-includes + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/standard/flow-with-additional-includes run: | diff --git a/.github/workflows/samples_flows_standard_flow_with_symlinks.yml b/.github/workflows/samples_flows_standard_flow_with_symlinks.yml index 00ef9c57a6b..d5ea7d902ca 100644 --- a/.github/workflows/samples_flows_standard_flow_with_symlinks.yml +++ b/.github/workflows/samples_flows_standard_flow_with_symlinks.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/standard/flow-with-symlinks run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -60,10 +62,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/standard/flow-with-symlinks/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/flow-with-symlinks/README.md -o examples/flows/standard/flow-with-symlinks + - name: Cat script + working-directory: examples/flows/standard/flow-with-symlinks + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/standard/flow-with-symlinks run: | diff --git a/.github/workflows/samples_flows_standard_intent_copilot.yml b/.github/workflows/samples_flows_standard_intent_copilot.yml index 6c408112f1f..667e909f6ad 100644 --- a/.github/workflows/samples_flows_standard_intent_copilot.yml +++ b/.github/workflows/samples_flows_standard_intent_copilot.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/standard/intent-copilot run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -57,10 +59,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/standard/intent-copilot/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/intent-copilot/README.md -o examples/flows/standard/intent-copilot + - name: Cat script + working-directory: 
examples/flows/standard/intent-copilot + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/standard/intent-copilot run: | diff --git a/.github/workflows/samples_flows_standard_named_entity_recognition.yml b/.github/workflows/samples_flows_standard_named_entity_recognition.yml new file mode 100644 index 00000000000..6fadaaab88a --- /dev/null +++ b/.github/workflows/samples_flows_standard_named_entity_recognition.yml @@ -0,0 +1,84 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: samples_flows_standard_named_entity_recognition +on: + schedule: + - cron: "11 22 * * *" # Every day starting at 6:11 BJT + pull_request: + branches: [ main ] + paths: [ examples/** ] + workflow_dispatch: + +jobs: + samples_readme_ci: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Generate config.json + run: echo ${{ secrets.TEST_WORKSPACE_CONFIG_JSON }} > ${{ github.workspace }}/examples/config.json + - name: Prepare requirements + working-directory: examples/flows/standard/named-entity-recognition + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/flows/standard/named-entity-recognition + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s/<aoai-api-key>/$AOAI_API_KEY/g" -e "s/<aoai-api-endpoint>/$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + - name: Create run.yml + working-directory: examples/flows/standard/named-entity-recognition + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/flows/standard/named-entity-recognition/README.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/named-entity-recognition/README.md -o examples/flows/standard/named-entity-recognition + - name: Cat script + working-directory: examples/flows/standard/named-entity-recognition + run: | + cat bash_script.sh + - name: Run scripts + working-directory: examples/flows/standard/named-entity-recognition + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME }} + bash bash_script.sh + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path:
examples/flows/standard/named-entity-recognition/bash_script.sh \ No newline at end of file diff --git a/.github/workflows/samples_flows_standard_summarizing_film_with_autogpt.yml b/.github/workflows/samples_flows_standard_summarizing_film_with_autogpt.yml index e0330b4b268..307d5778136 100644 --- a/.github/workflows/samples_flows_standard_summarizing_film_with_autogpt.yml +++ b/.github/workflows/samples_flows_standard_summarizing_film_with_autogpt.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/standard/summarizing-film-with-autogpt run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -57,10 +59,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/standard/summarizing-film-with-autogpt/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/summarizing-film-with-autogpt/README.md -o examples/flows/standard/summarizing-film-with-autogpt + - name: Cat script + working-directory: examples/flows/standard/summarizing-film-with-autogpt + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/standard/summarizing-film-with-autogpt run: | diff --git a/.github/workflows/samples_flows_standard_web_classification.yml b/.github/workflows/samples_flows_standard_web_classification.yml index d8a1aed318d..014d08a318c 100644 --- a/.github/workflows/samples_flows_standard_web_classification.yml +++ b/.github/workflows/samples_flows_standard_web_classification.yml @@ -27,8 +27,10 @@ jobs: - name: Prepare requirements working-directory: examples/flows/standard/web-classification run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi - name: Prepare dev requirements working-directory: examples run: | @@ -57,10 +59,14 @@ jobs: uses: azure/login@v1 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - - name: Extract Steps + - name: Extract Steps examples/flows/standard/web-classification/README.md working-directory: ${{ github.workspace }} run: | python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/web-classification/README.md -o examples/flows/standard/web-classification + - name: Cat script + working-directory: examples/flows/standard/web-classification + run: | + cat bash_script.sh - name: Run scripts working-directory: examples/flows/standard/web-classification run: | diff --git a/.github/workflows/samples_tutorials_e2e_development_chat_with_pdf.yml b/.github/workflows/samples_tutorials_e2e_development_chat_with_pdf.yml new file mode 100644 index 00000000000..24af5bc4682 --- /dev/null +++ b/.github/workflows/samples_tutorials_e2e_development_chat_with_pdf.yml @@ -0,0 +1,92 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: samples_tutorials_e2e_development_chat_with_pdf +on: + schedule: + - cron: "1 19 * * *" # Every day starting at 3:01 BJT + pull_request: + branches: [ main ] + paths: [ examples/** ] + workflow_dispatch: + +jobs: + samples_readme_ci: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Generate config.json + run: echo ${{ secrets.TEST_WORKSPACE_CONFIG_JSON }} > ${{ github.workspace }}/examples/config.json + - name: Prepare requirements + working-directory: examples/tutorials/e2e-development + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/tutorials/e2e-development + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s/<aoai-api-key>/$AOAI_API_KEY/g" -e "s/<aoai-api-endpoint>/$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + - name: Create AOAI Connection from ENV file + working-directory: examples/tutorials/e2e-development + run: | + if [[ -e .env ]]; then + pf connection create --file .env --name chat_with_pdf_custom_connection + pf connection list + fi + + - name: Create run.yml + working-directory: examples/tutorials/e2e-development + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/tutorials/e2e-development/chat-with-pdf.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/tutorials/e2e-development/chat-with-pdf.md -o examples/tutorials/e2e-development + - name: Cat script + working-directory: examples/tutorials/e2e-development + run: | + cat bash_script.sh + - name: Run scripts + working-directory: examples/tutorials/e2e-development + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME }} + bash bash_script.sh + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/tutorials/e2e-development/bash_script.sh \ No newline at end of file diff --git a/examples/README.md b/examples/README.md index 91d5e3f6fa5..a98bb8c1930 100644 --- a/examples/README.md +++ b/examples/README.md @@ -22,6 +22,13 @@ ## CLI examples +### Tutorials ([tutorials](tutorials)) + +| path | status | description | +------|--------|------------- +| [chat-with-pdf](tutorials/e2e-development/chat-with-pdf.md) |
[![samples_tutorials_e2e_development_chat_with_pdf](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_e2e_development_chat_with_pdf.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_e2e_development_chat_with_pdf.yml) | In this tutorial, we will create a simple flow that mimics the retrieval of relevant information from the PDF to generate an answer with GPT | + + ### Flows ([flows](flows)) #### [Standard flows](flows/standard/) @@ -34,6 +41,7 @@ | [flow-with-additional-includes](flows/standard/flow-with-additional-includes/README.md) | [![samples_flows_standard_flow_with_additional_includes](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_flow_with_additional_includes.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_flow_with_additional_includes.yml) | Users sometimes need to reference common files or folders; this sample demos how to solve the problem using additional_includes | | [flow-with-symlinks](flows/standard/flow-with-symlinks/README.md) | [![samples_flows_standard_flow_with_symlinks](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_flow_with_symlinks.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_flow_with_symlinks.yml) | Users sometimes need to reference common files or folders; this sample demos how to solve the problem using symlinks | | [intent-copilot](flows/standard/intent-copilot/README.md) | [![samples_flows_standard_intent_copilot](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_intent_copilot.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_intent_copilot.yml) | This example shows how to create a flow from existing langchain code | +| [named-entity-recognition](flows/standard/named-entity-recognition/README.md) | [![samples_flows_standard_named_entity_recognition](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_named_entity_recognition.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_named_entity_recognition.yml) | A flow that performs a named entity recognition task | | [summarizing-film-with-autogpt](flows/standard/summarizing-film-with-autogpt/README.md) | [![samples_flows_standard_summarizing_film_with_autogpt](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_summarizing_film_with_autogpt.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_summarizing_film_with_autogpt.yml) | This is a flow showcasing how to construct an AutoGPT flow that autonomously figures out how to apply the given functions to solve the goal, which in this sample is film trivia providing accurate and up-to-date information about movies, directors, actors, and more | | [web-classification](flows/standard/web-classification/README.md) | [![samples_flows_standard_web_classification](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_web_classification.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_standard_web_classification.yml) | This is a flow demonstrating multi-class classification with LLM | @@ -44,6 +52,9 @@ ------|--------|------------- |
[basic-eval](flows/evaluation/basic-eval/README.md) | [![samples_flows_evaluation_basic_eval](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_basic_eval.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_basic_eval.yml) | This example shows how to create a basic evaluation flow | | [classification-accuracy-eval](flows/evaluation/classification-accuracy-eval/README.md) | [![samples_flows_evaluation_classification_accuracy_eval](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_classification_accuracy_eval.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_classification_accuracy_eval.yml) | This is a flow illustrating how to evaluate the performance of a classification system | +| [eval-entity-match-rate](flows/evaluation/eval-entity-match-rate/README.md) | [![samples_flows_evaluation_eval_entity_match_rate](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_entity_match_rate.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_entity_match_rate.yml) | This is a flow that evaluates entity match rate | +| [groundedness-eval](flows/evaluation/groundedness-eval/README.md) | [![samples_flows_evaluation_groundedness_eval](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_groundedness_eval.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_groundedness_eval.yml) | This is a flow that leverages an LLM to evaluate groundedness: whether the answer states facts that are all present in the given context | +| [perceived-intelligence-eval](flows/evaluation/perceived-intelligence-eval/README.md) | [![samples_flows_evaluation_perceived_intelligence_eval](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_perceived_intelligence_eval.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_perceived_intelligence_eval.yml) | This is a flow that leverages an LLM to evaluate perceived intelligence | #### [Chat flows](flows/chat/) diff --git a/examples/dev_requirements.txt b/examples/dev_requirements.txt index 556e574f79a..026b3ba448a 100644 --- a/examples/dev_requirements.txt +++ b/examples/dev_requirements.txt @@ -6,4 +6,5 @@ keyrings.alt black-nb pypandoc # for markdown reader pypandoc_binary # pypandoc pandoc backend -panflute # for pandoc filters \ No newline at end of file +panflute # for pandoc filters +jinja2 # for readme generation \ No newline at end of file diff --git a/examples/flows/chat/chat-with-pdf/.promptflow/flow.tools.json b/examples/flows/chat/chat-with-pdf/.promptflow/flow.tools.json index 0f808b74a26..53f9b743e63 100644 --- a/examples/flows/chat/chat-with-pdf/.promptflow/flow.tools.json +++ b/examples/flows/chat/chat-with-pdf/.promptflow/flow.tools.json @@ -1,44 +1,336 @@ { - "package": {}, - "code": { - "setup_env.py": { - "type": "python", - "inputs": { - "conn": { - "type": [ - "CustomConnection" - ] + "package": { + "promptflow.tools.azure_content_safety.AzureContentSafety.analyze_text": { + "class_name": "AzureContentSafety", + "function": "analyze_text", + "inputs": { + "connection": { + "type": [ + "AzureContentSafetyConnection" + ] + }, + "hate_category": { + "default": "medium_sensitivity", + "enum": [ + "disable", + "low_sensitivity",
"medium_sensitivity", + "high_sensitivity" + ], + "type": [ + "string" + ] + }, + "self_harm_category": { + "default": "medium_sensitivity", + "enum": [ + "disable", + "low_sensitivity", + "medium_sensitivity", + "high_sensitivity" + ], + "type": [ + "string" + ] + }, + "sexual_category": { + "default": "medium_sensitivity", + "enum": [ + "disable", + "low_sensitivity", + "medium_sensitivity", + "high_sensitivity" + ], + "type": [ + "string" + ] + }, + "text": { + "type": [ + "string" + ] + }, + "violence_category": { + "default": "medium_sensitivity", + "enum": [ + "disable", + "low_sensitivity", + "medium_sensitivity", + "high_sensitivity" + ], + "type": [ + "string" + ] + } + }, + "module": "promptflow.tools.azure_content_safety", + "name": "Content Safety (Text)", + "description": "Use Azure Content Safety to detect harmful content.", + "type": "python", + "package": "promptflow-tools", + "package_version": "0.1.0b3" + }, + "promptflow.tools.azure_language_detector.get_language": { + "name": "Azure Language Detector", + "description": "Detect the language of the input text.", + "type": "python", + "module": "promptflow.tools.azure_language_detector", + "function": "get_language", + "inputs": { + "connection": { + "type": [ + "CustomConnection" + ] + }, + "input_text": { + "type": [ + "string" + ] + } + }, + "package": "promptflow-tools", + "package_version": "0.1.0b3" + }, + "promptflow.tools.azure_translator.get_translation": { + "name": "Azure Translator", + "description": "Use Azure Translator API for translating text between 130+ languages.", + "type": "python", + "module": "promptflow.tools.azure_translator", + "function": "get_translation", + "inputs": { + "connection": { + "type": [ + "CustomConnection" + ] + }, + "input_text": { + "type": [ + "string" + ] + }, + "source_language": { + "type": [ + "string" + ] + }, + "target_language": { + "default": "en", + "type": [ + "string" + ] + } + }, + "package": "promptflow-tools", + "package_version": "0.1.0b3" + }, + "promptflow.tools.embedding.embedding": { + "name": "Embedding", + "description": "Use Open AI's embedding model to create an embedding vector representing the input text.", + "type": "python", + "module": "promptflow.tools.embedding", + "function": "embedding", + "inputs": { + "connection": { + "type": [ + "AzureOpenAIConnection", + "OpenAIConnection" + ] + }, + "deployment_name": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "AzureOpenAIConnection" + ], + "capabilities": { + "completion": false, + "chat_completion": false, + "embeddings": true + }, + "model_list": [ + "text-embedding-ada-002", + "text-search-ada-doc-001", + "text-search-ada-query-001" + ] + }, + "model": { + "type": [ + "string" + ], + "enabled_by": "connection", + "enabled_by_type": [ + "OpenAIConnection" + ], + "enum": [ + "text-embedding-ada-002", + "text-search-ada-doc-001", + "text-search-ada-query-001" + ] + }, + "input": { + "type": [ + "string" + ] + } + }, + "package": "promptflow-tools", + "package_version": "0.1.0b3" + }, + "promptflow.tools.serpapi.SerpAPI.search": { + "name": "Serp API", + "description": "Use Serp API to obtain search results from a specific search engine.", + "inputs": { + "connection": { + "type": [ + "SerpConnection" + ] + }, + "engine": { + "default": "google", + "enum": [ + "google", + "bing" + ], + "type": [ + "string" + ] + }, + "location": { + "default": "", + "type": [ + "string" + ] + }, + "num": { + "default": "10", + "type": [ + "int" + ] + }, + "query": { + "type": [ + 
"string" + ] + }, + "safe": { + "default": "off", + "enum": [ + "active", + "off" + ], + "type": [ + "string" + ] + } + }, + "type": "python", + "module": "promptflow.tools.serpapi", + "class_name": "SerpAPI", + "function": "search", + "package": "promptflow-tools", + "package_version": "0.1.0b3" } - }, - "source": "setup_env.py", - "function": "setup_env" }, - "chat_with_pdf_tool.py": { - "type": "python", - "inputs": { - "question": { - "type": [ - "string" - ] - }, - "pdf_url": { - "type": [ - "string" - ] - }, - "history": { - "type": [ - "list" - ] - }, - "ready": { - "type": [ - "string" - ] + "code": { + "setup_env.py": { + "type": "python", + "inputs": { + "aoai_connection": { + "type": [ + "AzureOpenAIConnection", + "CustomConnection" + ] + }, + "config": { + "type": [ + "object" + ] + } + }, + "source": "setup_env.py", + "function": "setup_env" + }, + "download_tool.py": { + "type": "python", + "inputs": { + "url": { + "type": [ + "string" + ] + }, + "env_ready_signal": { + "type": [ + "string" + ] + } + }, + "source": "download_tool.py", + "function": "download_tool" + }, + "build_index_tool.py": { + "type": "python", + "inputs": { + "pdf_path": { + "type": [ + "string" + ] + } + }, + "source": "build_index_tool.py", + "function": "build_index_tool" + }, + "find_context_tool.py": { + "type": "python", + "inputs": { + "question": { + "type": [ + "string" + ] + }, + "index_path": { + "type": [ + "string" + ] + } + }, + "source": "find_context_tool.py", + "function": "find_context_tool" + }, + "qna_tool.py": { + "type": "python", + "inputs": { + "prompt": { + "type": [ + "string" + ] + }, + "history": { + "type": [ + "list" + ] + } + }, + "source": "qna_tool.py", + "function": "qna_tool" + }, + "rewrite_question_tool.py": { + "type": "python", + "inputs": { + "question": { + "type": [ + "string" + ] + }, + "history": { + "type": [ + "list" + ] + } + }, + "source": "rewrite_question_tool.py", + "function": "rewrite_question_tool" } - }, - "source": "chat_with_pdf_tool.py", - "function": "chat_with_pdf_tool" } - } } \ No newline at end of file diff --git a/examples/flows/chat/chat-with-pdf/README.md b/examples/flows/chat/chat-with-pdf/README.md index 4c29709d83f..addbb89955d 100644 --- a/examples/flows/chat/chat-with-pdf/README.md +++ b/examples/flows/chat/chat-with-pdf/README.md @@ -5,10 +5,11 @@ You can run the flow with a URL to a PDF file and question as argument. Once it's launched it will download the PDF and build an index of the content. Then when you ask a question, it will look up the index to retrieve relevant content and post the question with the relevant content to OpenAI chat model (gpt-3.5-turbo or gpt4) to get an answer. +Learn more on corresponding [tutorials](../../../tutorials/e2e-development/chat-with-pdf.md). 
+ + Tools used in this flow: - custom `python` Tool - ## Prerequisites Install promptflow sdk and other dependencies: @@ -17,34 +18,17 @@ pip install -r requirements.txt ``` ## Get started -### Create .env file in this folder with below content -``` -OPENAI_API_BASE= -OPENAI_API_KEY= -OPENAI_API_VERSION=2023-03-15-preview -EMBEDDING_MODEL_DEPLOYMENT_NAME=text-embedding-ada-002 -CHAT_MODEL_DEPLOYMENT_NAME=gpt-35-turbo -PROMPT_TOKEN_LIMIT=3000 -MAX_COMPLETION_TOKENS=256 -VERBOSE=false -CHUNK_SIZE=512 -CHUNK_OVERLAP=0 -``` -Note: CHAT_MODEL_DEPLOYMENT_NAME should point to a chat model like gpt-3.5-turbo or gpt-4 +### Create connection in this folder ```bash -if pf connection list | grep chat_with_pdf_custom_connection; then - echo "chat_with_pdf_custom_connection already exists" +# create connection needed by flow +if pf connection list | grep azure_open_ai_connection; then + echo "azure_open_ai_connection already exists" else - pf connection create --file .env --name chat_with_pdf_custom_connection + pf connection create --file azure_openai.yml --name azure_open_ai_connection --set api_key=<aoai-api-key> api_base=<aoai-api-endpoint> fi ``` -### Turn into a prompt flow with just 1 node [flow.dag.yaml](flow.dag.yaml) -Small changes to the code base to turn it into a single-node prompt flow. The benefit is to get a streamlined test, evaluate, debugging, deployment experience. -![single-node-flow](assets/single-node-flow.png) -![batch-run](assets/batch-run.gif) - ### CLI Example ```bash @@ -52,17 +36,21 @@ Small changes to the code base to turn it into a single-node prompt flow. The be pf flow test --flow . # test with flow inputs -pf flow test --flow . --inputs question="What is the name of the new language representation model introduced in the document?" pdf_url="https://arxiv.org/pdf/1810.04805.pdf" +pf flow test --flow . --inputs question="What is the name of the new language representation model introduced in the document?" pdf_url="https://arxiv.org/pdf/1810.04805.pdf" config="{ \ +\"EMBEDDING_MODEL_DEPLOYMENT_NAME\": \"text-embedding-ada-002\", \ +\"CHAT_MODEL_DEPLOYMENT_NAME\": \"gpt-35-turbo\", \ +\"PROMPT_TOKEN_LIMIT\": \"2000\", \ +\"MAX_COMPLETION_TOKENS\": \"256\", \ +\"VERBOSE\": \"True\", \ +\"CHUNK_SIZE\": \"256\", \ +\"CHUNK_OVERLAP\": \"32\" }" # (Optional) create a random run name run_name="web_classification_"$(openssl rand -hex 12) # run with multiline data, --name is optional -pf run create --flow . --data ./data/bert-paper-qna-3-line.jsonl --stream --name $run_name +pf run create --flow . --data ./data/bert-paper-qna-3-line.jsonl --stream --name $run_name --column-mapping question='${data.question}' pdf_url='${data.pdf_url}' chat_history='${data.chat_history}' config="{ \"EMBEDDING_MODEL_DEPLOYMENT_NAME\": \"text-embedding-ada-002\", \"CHAT_MODEL_DEPLOYMENT_NAME\": \"gpt-35-turbo\", \"PROMPT_TOKEN_LIMIT\": \"2000\", \"MAX_COMPLETION_TOKENS\": \"256\", \"VERBOSE\": \"True\", \"CHUNK_SIZE\": \"256\", \"CHUNK_OVERLAP\": \"32\" }" # visualize run output details pf run visualize --name $run_name ``` - -### SDK Example: Run [pf.ipynb](pf.ipynb) -This is using prompt flow SDK to do evaluation for chat_with_pdf flow, using a test data with 10 questions.
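The README change above moves all tuning knobs out of the `.env`-based custom connection and into a `config` flow input, matching the new `setup_env` signature recorded in `flow.tools.json` (an `aoai_connection` plus a `config` object). The body of `setup_env.py` is not part of this diff; a minimal sketch of what such a node could look like, assuming it still exports the `OPENAI_*` environment variables listed in the removed `.env` example and signals readiness downstream with a plain string:

```python
import os

from promptflow import tool
from promptflow.connections import AzureOpenAIConnection


@tool
def setup_env(aoai_connection: AzureOpenAIConnection, config: dict):
    # Export the connection as the OPENAI_* variables the chat_with_pdf
    # package reads (names taken from the removed .env example).
    os.environ["OPENAI_API_BASE"] = aoai_connection.api_base
    os.environ["OPENAI_API_KEY"] = aoai_connection.api_key
    os.environ["OPENAI_API_VERSION"] = aoai_connection.api_version

    # The remaining knobs (PROMPT_TOKEN_LIMIT, CHUNK_SIZE, ...) now arrive
    # as a config object instead of a CustomConnection.
    for key, value in config.items():
        os.environ[key] = str(value)

    # Downstream nodes (e.g. download_tool's env_ready_signal input) only
    # need a signal that the environment is ready.
    return "Ready"
```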
diff --git a/examples/flows/chat/chat-with-pdf/__init__.py b/examples/flows/chat/chat-with-pdf/__init__.py new file mode 100644 index 00000000000..9d43870295c --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/__init__.py @@ -0,0 +1,6 @@ +import sys +import os + +sys.path.append( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "chat_with_pdf") +) diff --git a/examples/flows/chat/chat-with-pdf/assets/chat_with_pdf_simple.png b/examples/flows/chat/chat-with-pdf/assets/chat_with_pdf_simple.png new file mode 100644 index 00000000000..80d5dc00a51 Binary files /dev/null and b/examples/flows/chat/chat-with-pdf/assets/chat_with_pdf_simple.png differ diff --git a/examples/flows/chat/chat-with-pdf/assets/chat_with_pdf_with_rewrite.png b/examples/flows/chat/chat-with-pdf/assets/chat_with_pdf_with_rewrite.png new file mode 100644 index 00000000000..f7e529c1c54 Binary files /dev/null and b/examples/flows/chat/chat-with-pdf/assets/chat_with_pdf_with_rewrite.png differ diff --git a/examples/flows/chat/chat-with-pdf/assets/chatbot_console.gif b/examples/flows/chat/chat-with-pdf/assets/chatbot_console.gif new file mode 100644 index 00000000000..c9d6c3c4cc5 Binary files /dev/null and b/examples/flows/chat/chat-with-pdf/assets/chatbot_console.gif differ diff --git a/examples/flows/chat/chat-with-pdf/assets/multi-node-flow-chat-with-pdf.png b/examples/flows/chat/chat-with-pdf/assets/multi-node-flow-chat-with-pdf.png new file mode 100644 index 00000000000..c3b3466eb56 Binary files /dev/null and b/examples/flows/chat/chat-with-pdf/assets/multi-node-flow-chat-with-pdf.png differ diff --git a/examples/flows/chat/chat-with-pdf/azure_openai.yml b/examples/flows/chat/chat-with-pdf/azure_openai.yml new file mode 100644 index 00000000000..5646047bebe --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/azure_openai.yml @@ -0,0 +1,7 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/AzureOpenAIConnection.schema.json +name: azure_open_ai_connection +type: azure_open_ai +api_key: "" +api_base: "aoai-api-endpoint" +api_type: "azure" +api_version: "2023-03-15-preview" diff --git a/examples/flows/chat/chat-with-pdf/batch_run.yaml b/examples/flows/chat/chat-with-pdf/batch_run.yaml new file mode 100644 index 00000000000..6a83d4b0809 --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/batch_run.yaml @@ -0,0 +1,17 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Run.schema.json +#name: chat_with_pdf_default_20230820_162219_559000 +flow: . 
+data: ./data/bert-paper-qna.jsonl +#run: +column_mapping: + chat_history: ${data.chat_history} + pdf_url: ${data.pdf_url} + question: ${data.question} + config: + EMBEDDING_MODEL_DEPLOYMENT_NAME: text-embedding-ada-002 + CHAT_MODEL_DEPLOYMENT_NAME: gpt-35-turbo + PROMPT_TOKEN_LIMIT: 3000 + MAX_COMPLETION_TOKENS: 256 + VERBOSE: true + CHUNK_SIZE: 1024 + CHUNK_OVERLAP: 64 \ No newline at end of file diff --git a/examples/flows/chat/chat-with-pdf/build_index_tool.py b/examples/flows/chat/chat-with-pdf/build_index_tool.py index d24b05d3f96..abbff62e7ff 100644 --- a/examples/flows/chat/chat-with-pdf/build_index_tool.py +++ b/examples/flows/chat/chat-with-pdf/build_index_tool.py @@ -1,11 +1,4 @@ -# flake8: noqa: E402 -import os -import sys - from promptflow import tool - -# append chat_with_pdf to sys.path so code inside it can discover its modules -sys.path.append(f"{os.path.dirname(__file__)}/chat_with_pdf") from chat_with_pdf.build_index import create_faiss_index diff --git a/examples/flows/chat/chat-with-pdf/chat-with-pdf-azure.ipynb b/examples/flows/chat/chat-with-pdf/chat-with-pdf-azure.ipynb index 700dcfa511d..e6cad9206c7 100644 --- a/examples/flows/chat/chat-with-pdf/chat-with-pdf-azure.ipynb +++ b/examples/flows/chat/chat-with-pdf/chat-with-pdf-azure.ipynb @@ -66,7 +66,9 @@ "source": [ "import promptflow.azure as azure\n", "\n", - "pf = azure.PFClient(ml_client)" + "pf = azure.PFClient(ml_client)\n", + "runtime = 'chat_with_pdf_runtime'\n", + "# runtime = None # serverless" ] }, { @@ -76,26 +78,6 @@ "## 1.2 Create necessary connections\n", "\n", "Connection in prompt flow is for managing settings of your application behaviors incl. how to talk to different services (Azure OpenAI for example).\n", - "In many applications, configuration files or environment variables are used for this purpose. 
Chat_with_pdf also uses environment variables, to make it work with prompt flow and without changing how environment variables are used, we populate everything in the CustomConnection into environment variables.\n",
-    "```python\n",
-    "def setup_env(conn: CustomConnection):\n",
-    "    if not conn:\n",
-    "        return\n",
-    "    for key in conn:\n",
-    "        os.environ[key] = conn[key]\n",
-    "```\n",
-    "\n",
-    "chat_with_pdf requires following env vars (thus for the custom connection named `chat_with_pdf_custom_connection`):\n",
-    "```\n",
-    "OPENAI_API_BASE=\n",
-    "OPENAI_API_VERSION=2023-03-15-preview\n",
-    "OPENAI_API_KEY=\n",
-    "EMBEDDING_MODEL_DEPLOYMENT_NAME=text-embedding-ada-002\n",
-    "CHAT_MODEL_DEPLOYMENT_NAME=gpt-35-turbo\n",
-    "PROMPT_TOKEN_LIMIT=3000\n",
-    "MAX_COMPLETION_TOKENS=256\n",
-    "```\n",
-    "\n",
     "\n",
     "Prepare your Azure Open AI resource follow this [instruction](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) and get your `api_key` if you don't have one.\n",
@@ -109,25 +91,9 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "from promptflow.entities import CustomConnection\n",
-    "\n",
-    "conn_name = \"chat_with_pdf_custom_connection\"\n",
-    "\n",
-    "conn = CustomConnection(\n",
-    "    name=conn_name,\n",
-    "    configs={\n",
-    "        \"OPENAI_API_VERSION\": \"2023-03-15-preview\",\n",
-    "        \"EMBEDDING_MODEL_DEPLOYMENT_NAME\": \"text-embedding-ada-002\",\n",
-    "        \"CHAT_MODEL_DEPLOYMENT_NAME\": \"gpt-35-turbo\",\n",
-    "        \"PROMPT_TOKEN_LIMIT\": \"3000\",\n",
-    "        \"MAX_COMPLETION_TOKENS\": \"256\",\n",
-    "    },\n",
-    "    secrets={\n",
-    "        \"OPENAI_API_BASE\": \"AOAI_ENDPOINT\", # replace this\n",
-    "        \"OPENAI_API_KEY\": \"AOAI_API_KEY\", # replace this\n",
-    "    },\n",
-    ")\n",
+    "conn_name = \"azure_open_ai_connection\"\n",
     "\n",
+    "# TODO integrate with azure.ai sdk\n",
     "# currently we only support create connection in Azure ML Studio UI\n",
     "# raise Exception(f\"Please create {conn_name} connection in Azure ML Studio.\")"
    ]
@@ -148,10 +114,28 @@
     "flow_path = \".\"\n",
     "data_path = \"./data/bert-paper-qna.jsonl\"\n",
     "\n",
+    "config_3k_context = {\n",
+    "    \"EMBEDDING_MODEL_DEPLOYMENT_NAME\": \"text-embedding-ada-002\",\n",
+    "    \"CHAT_MODEL_DEPLOYMENT_NAME\": \"gpt-35-turbo\",\n",
+    "    \"PROMPT_TOKEN_LIMIT\": 3000,\n",
+    "    \"MAX_COMPLETION_TOKENS\": 256,\n",
+    "    \"VERBOSE\": True,\n",
+    "    \"CHUNK_SIZE\": 256,\n",
+    "    \"CHUNK_OVERLAP\": 32,\n",
+    "}\n",
+    "\n",
+    "column_mapping={\n",
+    "    \"question\": \"${data.question}\",\n",
+    "    \"pdf_url\": \"${data.pdf_url}\",\n",
+    "    \"chat_history\": \"${data.chat_history}\",\n",
+    "    \"config\": config_3k_context,\n",
+    "}\n",
+    "\n",
     "run_3k_context = pf.run(\n",
     "    flow=flow_path,\n",
     "    data=data_path,\n",
-    "    connections={\"setup_env\": {\"conn\": \"chat_with_pdf_custom_connection\"}},\n",
+    "    column_mapping=column_mapping,\n",
+    "    runtime=runtime,\n",
     "    display_name=\"chat_with_pdf_3k_context\",\n",
     "    tags={\"chat_with_pdf\": \"\", \"2nd_round\": \"\"},\n",
     ")\n",
@@ -199,12 +183,30 @@
     "flow_path = \".\"\n",
     "data_path = \"./data/bert-paper-qna.jsonl\"\n",
     "\n",
+    "config_2k_context = {\n",
+    "    \"EMBEDDING_MODEL_DEPLOYMENT_NAME\": \"text-embedding-ada-002\",\n",
+    "    \"CHAT_MODEL_DEPLOYMENT_NAME\": \"gpt-35-turbo\",\n",
+    "    \"PROMPT_TOKEN_LIMIT\": 2000,\n",
+    "    \"MAX_COMPLETION_TOKENS\": 256,\n",
+    "    \"VERBOSE\": True,\n",
+    "    \"CHUNK_SIZE\": 256,\n",
+    "    \"CHUNK_OVERLAP\": 32,\n",
+    "}\n",
+    "\n",
+    "column_mapping={\n",
+    "    \"question\": \"${data.question}\",\n",
+    "    \"pdf_url\": \"${data.pdf_url}\",\n",
\"${data.pdf_url}\",\n", + " \"chat_history\": \"${data.chat_history}\",\n", + " \"config\": config_2k_context,\n", + "}\n", "run_2k_context = pf.run(\n", " flow=flow_path,\n", " data=data_path,\n", + " column_mapping=column_mapping,\n", " connections={\n", " \"setup_env\": {\"conn\": \"chat_with_pdf_custom_connection_smaller_context\"}\n", " },\n", + " runtime=runtime,\n", " display_name=\"chat_with_pdf_2k_context\",\n", " tags={\"chat_with_pdf\": \"\", \"2nd_round\": \"\"},\n", ")\n", diff --git a/examples/flows/chat/chat-with-pdf/chat-with-pdf.ipynb b/examples/flows/chat/chat-with-pdf/chat-with-pdf.ipynb index 8dd11d00e4b..6641300b4a3 100644 --- a/examples/flows/chat/chat-with-pdf/chat-with-pdf.ipynb +++ b/examples/flows/chat/chat-with-pdf/chat-with-pdf.ipynb @@ -30,26 +30,7 @@ "metadata": {}, "source": [ "## 1. Create connections\n", - "Connection in prompt flow is for managing settings of your application behaviors incl. how to talk to different services (Azure OpenAI for example).\n", - "In many applications, configuration files or environment variables are used for this purpose. Chat_with_pdf also uses environment variables, to make it work with prompt flow and without changing how environment variables are used, we populate everything in the CustomConnection into environment variables.\n", - "```python\n", - "def setup_env(conn: CustomConnection):\n", - " if not conn:\n", - " return\n", - " for key in conn:\n", - " os.environ[key] = conn[key]\n", - "```\n", - "\n", - "chat_with_pdf requires following env vars (thus for the custom connection named \"chat_with_pdf_custom_connection\"):\n", - "```\n", - "OPENAI_API_BASE=\n", - "OPENAI_API_VERSION=2023-03-15-preview\n", - "OPENAI_API_KEY=\n", - "EMBEDDING_MODEL_DEPLOYMENT_NAME=text-embedding-ada-002\n", - "CHAT_MODEL_DEPLOYMENT_NAME=gpt-35-turbo\n", - "PROMPT_TOKEN_LIMIT=3000\n", - "MAX_COMPLETION_TOKENS=256\n", - "```" + "Connection in prompt flow is for managing settings of your application behaviors incl. how to talk to different services (Azure OpenAI for example)." 
   ]
  },
  {
@@ -73,31 +54,27 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "from promptflow.entities import CustomConnection\n",
-    "\n",
-    "conn_name = \"chat_with_pdf_custom_connection\"\n",
+    "# create needed connection\n",
+    "from promptflow.entities import AzureOpenAIConnection\n",
     "\n",
-    "if len([c for c in pf.connections.list() if c.name == conn_name]) == 0:\n",
-    "    # Create the custom connection that is required by chat_with_pdf_tool\n",
-    "    print(f\"Creating custom connection: {conn_name}\")\n",
-    "    conn = CustomConnection(\n",
+    "try:\n",
+    "    conn_name = \"azure_open_ai_connection\"\n",
+    "    conn = pf.connections.get(name=conn_name)\n",
+    "    print(\"using existing connection\")\n",
+    "except Exception:\n",
+    "    # Follow https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal to create an Azure Open AI resource.\n",
+    "    connection = AzureOpenAIConnection(\n",
     "        name=conn_name,\n",
-    "        configs={\n",
-    "            \"OPENAI_API_VERSION\": \"2023-03-15-preview\",\n",
-    "            \"EMBEDDING_MODEL_DEPLOYMENT_NAME\": \"text-embedding-ada-002\",\n",
-    "            \"CHAT_MODEL_DEPLOYMENT_NAME\": \"gpt-35-turbo\",\n",
-    "            \"PROMPT_TOKEN_LIMIT\": \"3000\",\n",
-    "            \"MAX_COMPLETION_TOKENS\": \"256\",\n",
-    "        },\n",
-    "        secrets={\n",
-    "            \"OPENAI_API_BASE\": \"AOAI_ENDPOINT\",  # replace this\n",
-    "            \"OPENAI_API_KEY\": \"AOAI_API_KEY\",  # replace this\n",
-    "        },\n",
+    "        api_key=\"\",\n",
+    "        api_base=\"\",\n",
+    "        api_type=\"azure\",\n",
+    "        api_version=\"\",\n",
     "    )\n",
-    "    pf.connections.create_or_update(conn)\n",
-    "    print(f\"Custom connection: {conn_name} created.\")\n",
-    "else:\n",
-    "    print(f\"Custom connection: {conn_name} found.\")"
+    "\n",
+    "    conn = pf.connections.create_or_update(connection)\n",
+    "    print(\"successfully created connection\")\n",
+    "\n",
+    "print(conn)"
   ]
  },
  {
@@ -142,7 +119,23 @@
     "flow_path = \".\"\n",
     "data_path = \"./data/bert-paper-qna-3-line.jsonl\"\n",
     "\n",
-    "run = pf.run(flow=flow_path, data=data_path)\n",
+    "config_2k_context = {\n",
+    "    \"EMBEDDING_MODEL_DEPLOYMENT_NAME\": \"text-embedding-ada-002\",\n",
+    "    \"CHAT_MODEL_DEPLOYMENT_NAME\": \"gpt-35-turbo\",\n",
+    "    \"PROMPT_TOKEN_LIMIT\": 2000,\n",
+    "    \"MAX_COMPLETION_TOKENS\": 256,\n",
+    "    \"VERBOSE\": True,\n",
+    "    \"CHUNK_SIZE\": 256,\n",
+    "    \"CHUNK_OVERLAP\": 32,\n",
+    "}\n",
+    "\n",
+    "column_mapping = {\n",
+    "    \"question\": \"${data.question}\",\n",
+    "    \"pdf_url\": \"${data.pdf_url}\",\n",
+    "    \"chat_history\": \"${data.chat_history}\",\n",
+    "    \"config\": config_2k_context,\n",
+    "}\n",
+    "run = pf.run(flow=flow_path, data=data_path, column_mapping=column_mapping)\n",
     "pf.stream(run)\n",
     "\n",
     "print(run)"
diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf/README.md b/examples/flows/chat/chat-with-pdf/chat_with_pdf/README.md
index 2af10621549..b5439519213 100644
--- a/examples/flows/chat/chat-with-pdf/chat_with_pdf/README.md
+++ b/examples/flows/chat/chat-with-pdf/chat_with_pdf/README.md
@@ -18,8 +18,8 @@ CHAT_MODEL_DEPLOYMENT_NAME=gpt-35-turbo
 PROMPT_TOKEN_LIMIT=3000
 MAX_COMPLETION_TOKENS=256
 VERBOSE=false
-CHUNK_SIZE=512
-CHUNK_OVERLAP=0
+CHUNK_SIZE=1024
+CHUNK_OVERLAP=64
 ```
 Note: CHAT_MODEL_DEPLOYMENT_NAME should point to a chat model like gpt-3.5-turbo or gpt-4
 ### Run the command line
diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf/__init__.py b/examples/flows/chat/chat-with-pdf/chat_with_pdf/__init__.py
index d55ccad1f57..96a36c3a66e 100644
--- a/examples/flows/chat/chat-with-pdf/chat_with_pdf/__init__.py
+++ 
b/examples/flows/chat/chat-with-pdf/chat_with_pdf/__init__.py @@ -1 +1,4 @@ -__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +import sys +import os + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf/build_index.py b/examples/flows/chat/chat-with-pdf/chat_with_pdf/build_index.py index f7df836e6c5..fe6ec0dc622 100644 --- a/examples/flows/chat/chat-with-pdf/chat_with_pdf/build_index.py +++ b/examples/flows/chat/chat-with-pdf/chat_with_pdf/build_index.py @@ -3,7 +3,7 @@ import os from utils.aoai import AOAIEmbedding -from utils.langchain_faiss import LangChainFaissEngine +from utils.index import FAISSIndex from utils.logging import log from utils.lock import acquire_lock @@ -13,14 +13,17 @@ def create_faiss_index(pdf_path: str) -> str: lock_path = index_persistent_path + ".lock" log("Index path: " + os.path.abspath(index_persistent_path)) - chunk_size = int(os.environ.get("CHUNK_SIZE", 1024)) - chunk_overlap = int(os.environ.get("CHUNK_OVERLAP", 0)) + chunk_size = int(os.environ.get("CHUNK_SIZE")) + chunk_overlap = int(os.environ.get("CHUNK_OVERLAP")) log(f"Chunk size: {chunk_size}, chunk overlap: {chunk_overlap}") with acquire_lock(lock_path): - if os.path.exists(index_persistent_path): + if os.path.exists(os.path.join(index_persistent_path, "index.faiss")): log("Index already exists, bypassing index creation") return index_persistent_path + else: + if not os.path.exists(index_persistent_path): + os.makedirs(index_persistent_path) log("Building index") pdf_reader = PyPDF2.PdfReader(pdf_path) @@ -39,12 +42,10 @@ def create_faiss_index(pdf_path: str) -> str: log(f"Number of segments: {len(segments)}") - engine = LangChainFaissEngine( - index=faiss.IndexFlatL2(1536), embedding=AOAIEmbedding() - ) - engine.batch_insert_texts(segments) + index = FAISSIndex(index=faiss.IndexFlatL2(1536), embedding=AOAIEmbedding()) + index.insert_batch(segments) - engine.save_data_index_to_disk(index_persistent_path) + index.save(index_persistent_path) log("Index built: " + index_persistent_path) return index_persistent_path diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf/find_context.py b/examples/flows/chat/chat-with-pdf/chat_with_pdf/find_context.py new file mode 100644 index 00000000000..60fe977893c --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/chat_with_pdf/find_context.py @@ -0,0 +1,31 @@ +import faiss +from jinja2 import Environment, FileSystemLoader +import os + +from utils.index import FAISSIndex +from utils.aoai import AOAIEmbedding, render_with_token_limit +from utils.logging import log + + +def find_context(question: str, index_path: str): + index = FAISSIndex(index=faiss.IndexFlatL2(1536), embedding=AOAIEmbedding()) + index.load(path=index_path) + snippets = index.query(question, top_k=5) + + template = Environment( + loader=FileSystemLoader(os.path.dirname(os.path.abspath(__file__))) + ).get_template("qna_prompt.md") + token_limit = int(os.environ.get("PROMPT_TOKEN_LIMIT")) + + # Try to render the template with token limit and reduce snippet count if it fails + while True: + try: + prompt = render_with_token_limit( + template, token_limit, question=question, context=enumerate(snippets) + ) + break + except ValueError: + snippets = snippets[:-1] + log(f"Reducing snippet count to {len(snippets)} to fit token limit") + + return prompt, snippets diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf/main.py b/examples/flows/chat/chat-with-pdf/chat_with_pdf/main.py index 
d0272778145..dddbc77e8d9 100644 --- a/examples/flows/chat/chat-with-pdf/chat_with_pdf/main.py +++ b/examples/flows/chat/chat-with-pdf/chat_with_pdf/main.py @@ -1,11 +1,9 @@ -# flake8: noqa: E402 - import argparse from dotenv import load_dotenv import os -import sys from qna import qna +from find_context import find_context from rewrite_question import rewrite_question from build_index import create_faiss_index from download import download @@ -13,7 +11,6 @@ def chat_with_pdf(question: str, pdf_url: str, history: list): - with acquire_lock("create_folder.lock"): if not os.path.exists(".pdfs"): os.mkdir(".pdfs") @@ -23,7 +20,8 @@ def chat_with_pdf(question: str, pdf_url: str, history: list): pdf_path = download(pdf_url) index_path = create_faiss_index(pdf_path) q = rewrite_question(question, history) - stream, context = qna(q, index_path, history) + prompt, context = find_context(q, index_path) + stream = qna(prompt, history) return stream, context @@ -38,12 +36,8 @@ def print_stream_and_return_full_answer(stream): return answer -def main(): - parser = argparse.ArgumentParser(description="Ask questions about a PDF file") - parser.add_argument("url", help="URL to the PDF file") - args = parser.parse_args() - - load_dotenv() +def main_loop(url: str): + load_dotenv(os.path.join(os.path.dirname(__file__), ".env")) history = [] while True: @@ -51,9 +45,9 @@ def main(): if question == "q!": break - print("\033[92m" + "$Bot: " + "\033[0m", end=" ", flush=True) - stream, context = chat_with_pdf(question, args.url, history) + stream, context = chat_with_pdf(question, url, history) + print("\033[92m" + "$Bot: " + "\033[0m", end=" ", flush=True) answer = print_stream_and_return_full_answer(stream) history = history + [ {"role": "user", "content": question}, @@ -61,5 +55,13 @@ def main(): ] +def main(): + parser = argparse.ArgumentParser(description="Ask questions about a PDF file") + parser.add_argument("url", help="URL to the PDF file") + args = parser.parse_args() + + main_loop(args.url) + + if __name__ == "__main__": - main() + main_loop("https://arxiv.org/pdf/1810.04805.pdf") diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf/qna.py b/examples/flows/chat/chat-with-pdf/chat_with_pdf/qna.py index 1ee9b63b7e1..73565767ce6 100644 --- a/examples/flows/chat/chat-with-pdf/chat_with_pdf/qna.py +++ b/examples/flows/chat/chat-with-pdf/chat_with_pdf/qna.py @@ -1,41 +1,15 @@ -import faiss -from jinja2 import Environment, FileSystemLoader import os -from utils.langchain_faiss import LangChainFaissEngine -from utils.aoai import AOAIEmbedding, AOAIChat, render_with_token_limit -from utils.logging import log +from utils.aoai import AOAIChat -def qna(question: str, index_path: str, history: list): - engine = LangChainFaissEngine( - index=faiss.IndexFlatL2(1536), embedding=AOAIEmbedding() - ) - engine.load_data_index_from_disk(path=index_path) - snippets = engine.search_by_text(question, top_k=5) - - template = Environment( - loader=FileSystemLoader(os.path.dirname(os.path.abspath(__file__))) - ).get_template("qna_prompt.md") - token_limit = int(os.environ.get("PROMPT_TOKEN_LIMIT")) +def qna(prompt: str, history: list): max_completion_tokens = int(os.environ.get("MAX_COMPLETION_TOKENS")) - # Try to render the template with token limit and reduce snippet count if it fails - while True: - try: - prompt = render_with_token_limit( - template, token_limit, question=question, context=enumerate(snippets) - ) - break - except ValueError: - snippets = snippets[:-1] - log(f"Reducing snippet count to {len(snippets)} 
to fit token limit") - chat = AOAIChat() stream = chat.stream( messages=history + [{"role": "user", "content": prompt}], max_tokens=max_completion_tokens, ) - context = [s.text for s in snippets] - return stream, context + return stream diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf/rewrite_question.py b/examples/flows/chat/chat-with-pdf/chat_with_pdf/rewrite_question.py index 44a81044de5..250cb879c47 100644 --- a/examples/flows/chat/chat-with-pdf/chat_with_pdf/rewrite_question.py +++ b/examples/flows/chat/chat-with-pdf/chat_with_pdf/rewrite_question.py @@ -1,7 +1,5 @@ from jinja2 import Environment, FileSystemLoader import os - - from utils.logging import log from utils.aoai import AOAIChat, render_with_token_limit diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf/utils/index.py b/examples/flows/chat/chat-with-pdf/chat_with_pdf/utils/index.py new file mode 100644 index 00000000000..0a64207f5ed --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/chat_with_pdf/utils/index.py @@ -0,0 +1,73 @@ +import os +from typing import Iterable, List, Optional +from dataclasses import dataclass +from faiss import Index +import faiss +import pickle +import numpy as np + +from .aoai import AOAIEmbedding as Embedding + + +@dataclass +class SearchResultEntity: + text: str = None + vector: List[float] = None + score: float = None + original_entity: dict = None + metadata: dict = None + + +INDEX_FILE_NAME = "index.faiss" +DATA_FILE_NAME = "index.pkl" + + +class FAISSIndex: + def __init__(self, index: Index, embedding: Embedding) -> None: + self.index = index + self.docs = {} # id -> doc, doc is (text, metadata) + self.embedding = embedding + + def insert_batch( + self, texts: Iterable[str], metadatas: Optional[List[dict]] = None + ) -> None: + documents = [] + vectors = [] + for i, text in enumerate(texts): + metadata = metadatas[i] if metadatas else {} + vector = self.embedding.generate(text) + documents.append((text, metadata)) + vectors.append(vector) + + self.index.add(np.array(vectors, dtype=np.float32)) + self.docs.update( + {i: doc for i, doc in enumerate(documents, start=len(self.docs))} + ) + + pass + + def query(self, text: str, top_k: int = 10) -> List[SearchResultEntity]: + vector = self.embedding.generate(text) + scores, indices = self.index.search(np.array([vector], dtype=np.float32), top_k) + docs = [] + for j, i in enumerate(indices[0]): + if i == -1: # This happens when not enough docs are returned. 
+                continue
+            doc = self.docs[i]
+            docs.append(
+                SearchResultEntity(text=doc[0], metadata=doc[1], score=scores[0][j])
+            )
+        return docs
+
+    def save(self, path: str) -> None:
+        faiss.write_index(self.index, os.path.join(path, INDEX_FILE_NAME))
+        # dump docs to pickle file
+        with open(os.path.join(path, DATA_FILE_NAME), "wb") as f:
+            pickle.dump(self.docs, f)
+
+    def load(self, path: str) -> None:
+        self.index = faiss.read_index(os.path.join(path, INDEX_FILE_NAME))
+        with open(os.path.join(path, DATA_FILE_NAME), "rb") as f:
+            self.docs = pickle.load(f)
diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf/utils/logging.py b/examples/flows/chat/chat-with-pdf/chat_with_pdf/utils/logging.py
index b62c6785664..0122c0791f1 100644
--- a/examples/flows/chat/chat-with-pdf/chat_with_pdf/utils/logging.py
+++ b/examples/flows/chat/chat-with-pdf/chat_with_pdf/utils/logging.py
@@ -3,5 +3,5 @@ def log(message: str):
     verbose = os.environ.get("VERBOSE")
-    if verbose == "true":
+    if verbose and verbose.lower() == "true":
         print(message, flush=True)
diff --git a/examples/flows/chat/chat-with-pdf/chat_with_pdf_tool.py b/examples/flows/chat/chat-with-pdf/chat_with_pdf_tool.py
index 27ed4d05071..334755322df 100644
--- a/examples/flows/chat/chat-with-pdf/chat_with_pdf_tool.py
+++ b/examples/flows/chat/chat-with-pdf/chat_with_pdf_tool.py
@@ -1,11 +1,4 @@
-# flake8: noqa: E402
-import os
-import sys
-
 from promptflow import tool
-
-# append chat_with_pdf to sys.path so code inside it can discover its modules
-sys.path.append(f"{os.path.dirname(__file__)}/chat_with_pdf")
 from chat_with_pdf.main import chat_with_pdf
diff --git a/examples/flows/chat/chat-with-pdf/download_tool.py b/examples/flows/chat/chat-with-pdf/download_tool.py
index 94ad5d9b459..72baa90fac5 100644
--- a/examples/flows/chat/chat-with-pdf/download_tool.py
+++ b/examples/flows/chat/chat-with-pdf/download_tool.py
@@ -1,10 +1,4 @@
-# flake8: noqa: E402
-import os
-import sys
 from promptflow import tool
-
-# append chat_with_pdf to sys.path so code inside it can discover its modules
-sys.path.append(f"{os.path.dirname(__file__)}/chat_with_pdf")
 from chat_with_pdf.download import download
diff --git a/examples/flows/chat/chat-with-pdf/eval_run.yaml b/examples/flows/chat/chat-with-pdf/eval_run.yaml
new file mode 100644
index 00000000000..ee01acef819
--- /dev/null
+++ b/examples/flows/chat/chat-with-pdf/eval_run.yaml
@@ -0,0 +1,8 @@
+$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Run.schema.json
+#name: eval_groundedness_default_20230820_200152_009000
+flow: ../../evaluation/groundedness-eval
+run: chat_with_pdf_default_20230820_162219_559000
+column_mapping:
+  question: ${run.inputs.question}
+  answer: ${run.outputs.answer}
+  context: ${run.outputs.context}
\ No newline at end of file
diff --git a/examples/flows/chat/chat-with-pdf/find_context_tool.py b/examples/flows/chat/chat-with-pdf/find_context_tool.py
new file mode 100644
index 00000000000..246ceea2b73
--- /dev/null
+++ b/examples/flows/chat/chat-with-pdf/find_context_tool.py
@@ -0,0 +1,9 @@
+from promptflow import tool
+from chat_with_pdf.find_context import find_context
+
+
+@tool
+def find_context_tool(question: str, index_path: str):
+    prompt, context = find_context(question, index_path)
+
+    return {"prompt": prompt, "context": [c.text for c in context]}
diff --git a/examples/flows/chat/chat-with-pdf/flow.dag.yaml b/examples/flows/chat/chat-with-pdf/flow.dag.yaml
index 64b97bb4e08..0aa388e9c69 100644
--- a/examples/flows/chat/chat-with-pdf/flow.dag.yaml
+++ 
b/examples/flows/chat/chat-with-pdf/flow.dag.yaml @@ -1,40 +1,32 @@ inputs: chat_history: type: list - default: - - inputs: - question: what is BERT? - outputs: - answer: BERT (Bidirectional Encoder Representations from Transformers) is a - language representation model that pre-trains deep bidirectional - representations from unlabeled text by jointly conditioning on both - left and right context in all layers. Unlike other language - representation models, BERT can be fine-tuned with just one additional - output layer to create state-of-the-art models for a wide range of - tasks such as question answering and language inference, without - substantial task-specific architecture modifications. BERT is - effective for both fine-tuning and feature-based approaches. It - obtains new state-of-the-art results on eleven natural language - processing tasks, including pushing the GLUE score to 80.5% (7.7% - point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute - improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point - absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point - absolute improvement). + default: [] pdf_url: type: string default: https://arxiv.org/pdf/1810.04805.pdf question: type: string is_chat_input: true - default: what NLP tasks does it perform well? + default: what is BERT? + config: + type: object + default: + EMBEDDING_MODEL_DEPLOYMENT_NAME: text-embedding-ada-002 + CHAT_MODEL_DEPLOYMENT_NAME: gpt-35-turbo + PROMPT_TOKEN_LIMIT: 3000 + MAX_COMPLETION_TOKENS: 256 + VERBOSE: true + CHUNK_SIZE: 1024 + CHUNK_OVERLAP: 64 outputs: answer: type: string is_chat_output: true - reference: ${chat_with_pdf_tool.output.answer} + reference: ${qna_tool.output.answer} context: type: string - reference: ${chat_with_pdf_tool.output.context} + reference: ${find_context_tool.output.context} nodes: - name: setup_env type: python @@ -42,16 +34,44 @@ nodes: type: code path: setup_env.py inputs: - conn: chat_with_pdf_custom_connection -- name: chat_with_pdf_tool + aoai_connection: azure_open_ai_connection + config: ${inputs.config} +- name: download_tool + type: python + source: + type: code + path: download_tool.py + inputs: + url: ${inputs.pdf_url} + env_ready_signal: ${setup_env.output} +- name: build_index_tool + type: python + source: + type: code + path: build_index_tool.py + inputs: + pdf_path: ${download_tool.output} +- name: find_context_tool type: python source: type: code - path: chat_with_pdf_tool.py + path: find_context_tool.py inputs: + question: ${rewrite_question_tool.output} + index_path: ${build_index_tool.output} +- name: qna_tool + type: python + source: + type: code + path: qna_tool.py + inputs: + prompt: ${find_context_tool.output.prompt} history: ${inputs.chat_history} - pdf_url: ${inputs.pdf_url} +- name: rewrite_question_tool + type: python + source: + type: code + path: rewrite_question_tool.py + inputs: question: ${inputs.question} - ready: ${setup_env.output} -environment: - python_requirements_txt: requirements.txt + history: ${inputs.chat_history} diff --git a/examples/flows/chat/chat-with-pdf/flow.dag.yaml.single-node b/examples/flows/chat/chat-with-pdf/flow.dag.yaml.single-node new file mode 100644 index 00000000000..5cf9c36f356 --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/flow.dag.yaml.single-node @@ -0,0 +1,55 @@ +inputs: + chat_history: + type: list + default: + - inputs: + question: what is BERT? 
+ outputs: + answer: BERT (Bidirectional Encoder Representations from Transformers) is a + language representation model that pre-trains deep bidirectional + representations from unlabeled text by jointly conditioning on both + left and right context in all layers. Unlike other language + representation models, BERT can be fine-tuned with just one additional + output layer to create state-of-the-art models for a wide range of + tasks such as question answering and language inference, without + substantial task-specific architecture modifications. BERT is + effective for both fine-tuning and feature-based approaches. It + obtains new state-of-the-art results on eleven natural language + processing tasks, including pushing the GLUE score to 80.5% (7.7% + point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute + improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point + absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point + absolute improvement). + pdf_url: + type: string + default: https://arxiv.org/pdf/1810.04805.pdf + question: + type: string + is_chat_input: true + default: what NLP tasks does it perform well? +outputs: + answer: + type: string + is_chat_output: true + reference: ${chat_with_pdf_tool.output.answer} + context: + type: string + reference: ${chat_with_pdf_tool.output.context} +nodes: +- name: setup_env + type: python + source: + type: code + path: setup_env.py + inputs: + conn: my_custom_connection +- name: chat_with_pdf_tool + type: python + source: + type: code + path: chat_with_pdf_tool.py + inputs: + history: ${inputs.chat_history} + pdf_url: ${inputs.pdf_url} + question: ${inputs.question} + ready: ${setup_env.output} diff --git a/examples/flows/chat/chat-with-pdf/qna_tool.py b/examples/flows/chat/chat-with-pdf/qna_tool.py index 497a44c5c2d..98e131b75ef 100644 --- a/examples/flows/chat/chat-with-pdf/qna_tool.py +++ b/examples/flows/chat/chat-with-pdf/qna_tool.py @@ -1,25 +1,16 @@ -# flake8: noqa: E402 -import os -import sys - from promptflow import tool - -# append chat_with_pdf to sys.path so code inside it can discover its modules -sys.path.append(f"{os.path.dirname(__file__)}/chat_with_pdf") from chat_with_pdf.qna import qna @tool -def qna_tool(question: str, index_path: str, history: list): - stream, context = qna( - question, index_path, convert_chat_history_to_chatml_messages(history) - ) +def qna_tool(prompt: str, history: list): + stream = qna(prompt, convert_chat_history_to_chatml_messages(history)) answer = "" for str in stream: answer = answer + str + "" - return {"answer": answer, "context": context} + return {"answer": answer} def convert_chat_history_to_chatml_messages(history): diff --git a/examples/flows/chat/chat-with-pdf/rewrite_question_tool.py b/examples/flows/chat/chat-with-pdf/rewrite_question_tool.py index 8ac0c0034b2..aa81fc60e57 100644 --- a/examples/flows/chat/chat-with-pdf/rewrite_question_tool.py +++ b/examples/flows/chat/chat-with-pdf/rewrite_question_tool.py @@ -1,11 +1,4 @@ -# flake8: noqa: E402 -import os -import sys - from promptflow import tool - -# append chat_with_pdf to sys.path so code inside it can discover its modules -sys.path.append(f"{os.path.dirname(__file__)}/chat_with_pdf") from chat_with_pdf.rewrite_question import rewrite_question diff --git a/examples/flows/chat/chat-with-pdf/runtime_env/Dockerfile b/examples/flows/chat/chat-with-pdf/runtime_env/Dockerfile new file mode 100644 index 00000000000..7e8be714d9b --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/runtime_env/Dockerfile 
@@ -0,0 +1,3 @@ +FROM mcr.microsoft.com/azureml/promptflow/promptflow-runtime:latest +COPY ./requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt \ No newline at end of file diff --git a/examples/flows/chat/chat-with-pdf/runtime_env/build.sh b/examples/flows/chat/chat-with-pdf/runtime_env/build.sh new file mode 100644 index 00000000000..4f668be8fd2 --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/runtime_env/build.sh @@ -0,0 +1,17 @@ +registry_name=docker.io/modulesdkpreview +image_tag=chat_with_pdf + +docker build -t "$image_tag" . + +docker_image_tag=$registry_name/$image_tag + +echo "Docker image tag: $docker_image_tag" +docker tag "$image_tag" "$docker_image_tag" +image_tag=$docker_image_tag + +echo "Start pushing image...$image_tag" +docker push "$image_tag" + +# docker build -t chat_with_pdf . +# docker tag chat_with_pdf docker.io/modulesdkpreview/chat_with_pdf +# docker push docker.io/modulesdkpreview/chat_with_pdf \ No newline at end of file diff --git a/examples/flows/chat/chat-with-pdf/runtime_env/environment_runtime.yaml b/examples/flows/chat/chat-with-pdf/runtime_env/environment_runtime.yaml new file mode 100644 index 00000000000..b396fea8cff --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/runtime_env/environment_runtime.yaml @@ -0,0 +1,13 @@ +$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json +name: chat_with_pdf +image: docker.io/modulesdkpreview/chat_with_pdf +inference_config: + liveness_route: + port: 8080 + path: /health + readiness_route: + port: 8080 + path: /health + scoring_route: + port: 8080 + path: /score diff --git a/examples/flows/chat/chat-with-pdf/runtime_env/requirements.txt b/examples/flows/chat/chat-with-pdf/runtime_env/requirements.txt new file mode 100644 index 00000000000..8aa53ccc24d --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/runtime_env/requirements.txt @@ -0,0 +1,6 @@ +langchain +PyPDF2 +faiss-cpu +openai +jinja2 +python-dotenv \ No newline at end of file diff --git a/examples/flows/chat/chat-with-pdf/setup_env.py b/examples/flows/chat/chat-with-pdf/setup_env.py index db70c026096..b7f91f68afe 100644 --- a/examples/flows/chat/chat-with-pdf/setup_env.py +++ b/examples/flows/chat/chat-with-pdf/setup_env.py @@ -1,21 +1,27 @@ -# flake8: noqa: E402 import os -import sys from promptflow import tool -from promptflow.connections import CustomConnection +from promptflow.connections import AzureOpenAIConnection -# append chat_with_pdf to sys.path so code inside it can discover its modules -sys.path.append(f"{os.path.dirname(__file__)}/chat_with_pdf") from chat_with_pdf.utils.lock import acquire_lock @tool -def setup_env(conn: CustomConnection): - if not conn: +def setup_env(aoai_connection: AzureOpenAIConnection, config: dict): + if not aoai_connection or not config: return - for key in conn: - os.environ[key] = conn[key] + + os.environ["OPENAI_API_BASE"] = aoai_connection.api_base + os.environ["OPENAI_API_KEY"] = aoai_connection.api_key + os.environ["OPENAI_API_VERSION"] = aoai_connection.api_version + + if isinstance(config, str): + import json + # Workaround if runtime not passing config as dict + config = json.loads(config) + + for key in config: + os.environ[key] = str(config[key]) with acquire_lock("create_folder.lock"): if not os.path.exists(".pdfs"): diff --git a/examples/flows/chat/chat-with-pdf/tests/azure_chat_with_pdf_test.py b/examples/flows/chat/chat-with-pdf/tests/azure_chat_with_pdf_test.py new file mode 100644 index 00000000000..dc18dc2b6bd --- /dev/null +++ 
b/examples/flows/chat/chat-with-pdf/tests/azure_chat_with_pdf_test.py
@@ -0,0 +1,161 @@
+import unittest
+import promptflow.azure as azure
+from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
+from azure.ai.ml import MLClient
+from base_test import BaseTest
+import time
+import os
+
+
+class TestChatWithPDFAzure(BaseTest):
+    def setUp(self):
+        super().setUp()
+        self.data_path = os.path.join(self.flow_path, "data/bert-paper-qna-3-line.jsonl")
+
+        try:
+            credential = DefaultAzureCredential()
+            # Check whether the given credential can get a token successfully.
+            credential.get_token("https://management.azure.com/.default")
+        except Exception:
+            # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential does not work
+            credential = InteractiveBrowserCredential()
+
+        ml_client = MLClient.from_config(credential=credential)
+
+        self.pf = azure.PFClient(ml_client)
+        self.runtime = "chat_with_pdf_runtime"
+        # self.runtime = None # serverless
+
+    def tearDown(self) -> None:
+        return super().tearDown()
+
+    def test_bulk_run_chat_with_pdf(self):
+        run = self.create_chat_run(runtime=self.runtime)
+        self.pf.stream(run)  # wait for completion
+
+        self.assertEqual(run.status, "Completed")
+        details = self.pf.get_details(run)
+        self.assertEqual(details.shape[0], 3)
+
+    def test_eval(self):
+        display_name = "chat_with_pdf_2k_context"
+        run2k = self.create_chat_run(
+            column_mapping={
+                "question": "${data.question}",
+                "pdf_url": "${data.pdf_url}",
+                "chat_history": "${data.chat_history}",
+                "config": self.config_2k_context,
+            },
+            runtime=self.runtime,
+            display_name=display_name,
+        )
+        self.pf.stream(run2k)  # wait for completion
+        self.check_run_basics(run2k, display_name)
+
+        display_name = "eval_groundedness_2k_context"
+        eval2k_groundedness = self.create_eval_run(
+            self.eval_groundedness_flow_path,
+            run2k,
+            {
+                "question": "${run.inputs.question}",
+                "answer": "${run.outputs.answer}",
+                "context": "${run.outputs.context}",
+            },
+            runtime=self.runtime,
+            display_name=display_name,
+        )
+        self.pf.stream(eval2k_groundedness)  # wait for completion
+        self.check_run_basics(eval2k_groundedness, display_name)
+
+        details = self.pf.get_details(eval2k_groundedness)
+        self.assertGreater(details.shape[0], 2)
+
+        metrics, elapsed = self.wait_for_metrics(eval2k_groundedness)
+        self.assertGreaterEqual(metrics["groundedness"], 0.0)
+        self.assertLessEqual(elapsed, 5)  # metrics should be available within 5 seconds
+
+        display_name = "eval_perceived_intelligence_2k_context"
+        eval2k_pi = self.create_eval_run(
+            self.eval_perceived_intelligence_flow_path,
+            run2k,
+            {
+                "question": "${run.inputs.question}",
+                "answer": "${run.outputs.answer}",
+                "context": "${run.outputs.context}",
+            },
+            runtime=self.runtime,
+            display_name=display_name,
+        )
+        self.pf.stream(eval2k_pi)  # wait for completion
+        self.check_run_basics(eval2k_pi, display_name)
+
+        details = self.pf.get_details(eval2k_pi)
+        self.assertGreater(details.shape[0], 2)
+
+        metrics, elapsed = self.wait_for_metrics(eval2k_pi)
+        self.assertGreaterEqual(metrics["perceived_intelligence_score"], 0.0)
+        self.assertLessEqual(elapsed, 5)  # metrics should be available within 5 seconds
+
+    def test_bulk_run_valid_mapping(self):
+        data = os.path.join(self.flow_path, "data/bert-paper-qna-1-line.jsonl")
+        run = self.create_chat_run(
+            data=data,
+            column_mapping={
+                "question": "${data.question}",
+                "pdf_url": "${data.pdf_url}",
+                "chat_history": "${data.chat_history}",
+                "config": self.config_2k_context,
+            },
+            runtime=self.runtime,
+        )
+        
self.pf.stream(run) # wait for completion + + self.assertEqual(run.status, "Completed") + details = self.pf.get_details(run) + self.assertEqual(details.shape[0], 1) + + def test_bulk_run_mapping_missing_one_column(self): + run = self.create_chat_run( + column_mapping={ + "question": "${data.question}", + "pdf_url": "${data.pdf_url}", + }, + runtime=self.runtime, + ) + self.pf.stream(run) # wait for completion + + self.assertEqual(run.status, "Failed") + with self.assertRaises(Exception): + _ = self.pf.get_details(run) + + def test_bulk_run_invalid_mapping(self): + run = self.create_chat_run( + column_mapping={ + "question": "${data.question_not_exist}", + "pdf_url": "${data.pdf_url}", + "chat_history": "${data.chat_history}", + }, + runtime=self.runtime, + ) + + self.pf.stream(run) # wait for completion + + self.assertEqual(run.status, "Failed") + with self.assertRaises(Exception): + _ = self.pf.get_details(run) + + def wait_for_metrics(self, run): + start = time.time() + metrics = self.pf.get_metrics(run) + cnt = 3 + while len(metrics) == 0 and cnt > 0: + time.sleep(5) + metrics = self.pf.get_metrics(run) + cnt -= 1 + + end = time.time() + return metrics, end - start + + +if __name__ == "__main__": + unittest.main() diff --git a/examples/flows/chat/chat-with-pdf/tests/base_test.py b/examples/flows/chat/chat-with-pdf/tests/base_test.py new file mode 100644 index 00000000000..a6bc3769e8b --- /dev/null +++ b/examples/flows/chat/chat-with-pdf/tests/base_test.py @@ -0,0 +1,100 @@ +import unittest +import os +import json +import traceback + + +class BaseTest(unittest.TestCase): + def setUp(self): + root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../") + self.flow_path = os.path.join(root, "chat-with-pdf") + self.data_path = os.path.join( + self.flow_path, "data/bert-paper-qna-1-line.jsonl" + ) + self.eval_groundedness_flow_path = os.path.join(root, "../evaluation/groundedness-eval") + self.eval_perceived_intelligence_flow_path = os.path.join( + root, "../evaluation/perceived-intelligence-eval" + ) + self.all_runs_generated = [] + self.config_3k_context = { + "EMBEDDING_MODEL_DEPLOYMENT_NAME": "text-embedding-ada-002", + "CHAT_MODEL_DEPLOYMENT_NAME": "gpt-35-turbo", + "PROMPT_TOKEN_LIMIT": 3000, + "MAX_COMPLETION_TOKENS": 256, + "VERBOSE": True, + "CHUNK_SIZE": 256, + "CHUNK_OVERLAP": 32, + } + self.config_2k_context = { + "EMBEDDING_MODEL_DEPLOYMENT_NAME": "text-embedding-ada-002", + "CHAT_MODEL_DEPLOYMENT_NAME": "gpt-35-turbo", + "PROMPT_TOKEN_LIMIT": 2000, + "MAX_COMPLETION_TOKENS": 256, + "VERBOSE": True, + "CHUNK_SIZE": 256, + "CHUNK_OVERLAP": 32, + } + # TODO remove this when object passing is supported + self.config_3k_context = json.dumps(self.config_3k_context) + self.config_2k_context = json.dumps(self.config_2k_context) + + # Switch current working directory to the folder of this file + self.cwd = os.getcwd() + os.chdir(os.path.dirname(os.path.abspath(__file__))) + + def tearDown(self): + # Switch back to the original working directory + os.chdir(self.cwd) + + for run in self.all_runs_generated: + try: + self.pf.runs.archive(run.name) + except Exception as e: + print(e) + traceback.print_exc() + + def create_chat_run(self, data=None, column_mapping=None, connections=None, runtime=None, display_name='chat_run'): + if column_mapping is None: + column_mapping = { + "chat_history": "${data.chat_history}", + "pdf_url": "${data.pdf_url}", + "question": "${data.question}", + "config": self.config_2k_context, + } + data = self.data_path if data is None else data + + run = 
self.pf.run(
+            flow=self.flow_path,
+            data=data,
+            column_mapping=column_mapping,
+            connections=connections,
+            runtime=runtime,
+            display_name=display_name,
+            tags={"unittest": "true"},
+            stream=True,
+        )
+        self.all_runs_generated.append(run)
+        self.check_run_basics(run, display_name)
+        return run
+
+    def create_eval_run(
+        self, eval_flow_path, base_run, column_mapping, connections=None, runtime=None, display_name=None
+    ):
+        eval_run = self.pf.run(
+            flow=eval_flow_path,
+            run=base_run,
+            column_mapping=column_mapping,
+            connections=connections,
+            runtime=runtime,
+            display_name=display_name,
+            tags={"unittest": "true"},
+            stream=True,
+        )
+        self.all_runs_generated.append(eval_run)
+        self.check_run_basics(eval_run, display_name)
+        return eval_run
+
+    def check_run_basics(self, run, display_name):
+        self.assertIsNotNone(run)
+        self.assertEqual(run.display_name, display_name)
+        self.assertEqual(run.tags["unittest"], "true")
diff --git a/examples/flows/chat/chat-with-pdf/tests/chat_with_pdf_test.py b/examples/flows/chat/chat-with-pdf/tests/chat_with_pdf_test.py
new file mode 100644
index 00000000000..42c398a7f12
--- /dev/null
+++ b/examples/flows/chat/chat-with-pdf/tests/chat_with_pdf_test.py
@@ -0,0 +1,123 @@
+import unittest
+import promptflow
+from base_test import BaseTest
+from promptflow.executor._errors import InputNotFoundInInputsMapping
+
+
+class TestChatWithPDF(BaseTest):
+    def setUp(self):
+        super().setUp()
+        self.pf = promptflow.PFClient()
+
+    def tearDown(self) -> None:
+        return super().tearDown()
+
+    def test_run_chat_with_pdf(self):
+        result = self.pf.test(
+            flow=self.flow_path,
+            inputs={
+                "chat_history": [],
+                "pdf_url": "https://arxiv.org/pdf/1810.04805.pdf",
+                "question": "BERT stands for?",
+                "config": self.config_2k_context,
+            },
+        )
+        print(result)
+        self.assertTrue(
+            result["answer"].find(
+                "Bidirectional Encoder Representations from Transformers"
+            )
+            != -1
+        )
+
+    def test_bulk_run_chat_with_pdf(self):
+        run = self.create_chat_run()
+        self.pf.stream(run)  # wait for completion
+
+        self.assertEqual(run.status, "Completed")
+        details = self.pf.get_details(run)
+        self.assertEqual(details.shape[0], 1)
+
+    def test_eval(self):
+        run = self.create_chat_run()
+        self.pf.stream(run)  # wait for completion
+        self.assertEqual(run.status, "Completed")
+
+        display_name = 'groundedness_eval'
+        eval_run = self.create_eval_run(
+            self.eval_groundedness_flow_path,
+            run,
+            {
+                "question": "${run.inputs.question}",
+                "answer": "${run.outputs.answer}",
+                "context": "${run.outputs.context}",
+            },
+            display_name=display_name,
+        )
+        self.pf.stream(eval_run)  # wait for completion
+        self.assertEqual(eval_run.status, "Completed")
+
+        details = self.pf.get_details(eval_run)
+        self.assertEqual(details.shape[0], 1)
+
+        metrics = self.pf.get_metrics(eval_run)
+        self.assertGreaterEqual(metrics["groundedness"], 0.0)
+
+        display_name = 'perceived_intelligence_eval'
+        eval_run = self.create_eval_run(
+            self.eval_perceived_intelligence_flow_path,
+            run,
+            {
+                "question": "${run.inputs.question}",
+                "answer": "${run.outputs.answer}",
+                "context": "${run.outputs.context}",
+            },
+            display_name=display_name,
+        )
+        self.pf.stream(eval_run)  # wait for completion
+        self.assertEqual(eval_run.status, "Completed")
+
+        details = self.pf.get_details(eval_run)
+        self.assertEqual(details.shape[0], 1)
+
+        metrics = self.pf.get_metrics(eval_run)
+        self.assertGreaterEqual(metrics["perceived_intelligence_score"], 0.0)
+
+    def test_bulk_run_valid_mapping(self):
+        run = self.create_chat_run(
+            column_mapping={
+                "question": "${data.question}",
+                "pdf_url": "${data.pdf_url}",
"${data.pdf_url}", + "chat_history": "${data.chat_history}", + "config": self.config_2k_context, + } + ) + self.pf.stream(run) # wait for completion + + self.assertEqual(run.status, "Completed") + details = self.pf.get_details(run) + self.assertEqual(details.shape[0], 1) + + def test_bulk_run_mapping_missing_one_column(self): + # in this case, run won't be created. + with self.assertRaises(InputNotFoundInInputsMapping): + self.create_chat_run( + column_mapping={ + "question": "${data.question}", + "pdf_url": "${data.pdf_url}", + } + ) + + def test_bulk_run_invalid_mapping(self): + # in this case, run won't be created. + with self.assertRaises(InputNotFoundInInputsMapping): + self.create_chat_run( + column_mapping={ + "question": "${data.question_not_exist}", + "pdf_url": "${data.pdf_url}", + "chat_history": "${data.chat_history}", + } + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/examples/flows/evaluation/classification-accuracy-eval/README.md b/examples/flows/evaluation/classification-accuracy-eval/README.md index eb05bc3d8c3..688555551c3 100644 --- a/examples/flows/evaluation/classification-accuracy-eval/README.md +++ b/examples/flows/evaluation/classification-accuracy-eval/README.md @@ -12,6 +12,15 @@ In this flow, you will learn - the way to log metrics. use `from promptflow import log_metric` - see file [calculate_accuracy.py](calculate_accuracy.py) +### 0. Setup connection + +Prepare your Azure Open AI resource follow this [instruction](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) and get your `api_key` if you don't have one. + +```bash +# Override keys with --set to avoid yaml file changes +pf connection create --file ../../../connections/azure_openai.yml --set api_key= api_base= +``` + ### 1. Test flow/node ```bash diff --git a/examples/flows/evaluation/classification-accuracy-eval/requirements.txt b/examples/flows/evaluation/classification-accuracy-eval/requirements.txt index 923651dbfd6..16306751554 100644 --- a/examples/flows/evaluation/classification-accuracy-eval/requirements.txt +++ b/examples/flows/evaluation/classification-accuracy-eval/requirements.txt @@ -1,5 +1,3 @@ --extra-index-url https://azuremlsdktestpypi.azureedge.net/promptflow/ promptflow -promptflow-tools -langchain -jinja2 \ No newline at end of file +promptflow-tools \ No newline at end of file diff --git a/examples/flows/evaluation/eval-entity-match-rate/README.md b/examples/flows/evaluation/eval-entity-match-rate/README.md new file mode 100644 index 00000000000..041ea7364c3 --- /dev/null +++ b/examples/flows/evaluation/eval-entity-match-rate/README.md @@ -0,0 +1,27 @@ +# Entity match rate evaluation + +This is a flow evaluates: entity match rate. + +Tools used in this flow: +- `python` tool + +## Prerequisites + +Install promptflow sdk and other dependencies: +```bash +pip install -r requirements.txt +``` + +### 1. Test flow/node + +```bash +# test with default input value in flow.dag.yaml +pf flow test --flow . +``` + +### 2. create flow run with multi line data + +```bash +pf run create --flow . 
--data ./data.jsonl --stream +``` + diff --git a/examples/flows/evaluation/eval-entity-match-rate/cleansing.py b/examples/flows/evaluation/eval-entity-match-rate/cleansing.py new file mode 100644 index 00000000000..a017d7f1ab9 --- /dev/null +++ b/examples/flows/evaluation/eval-entity-match-rate/cleansing.py @@ -0,0 +1,11 @@ +from typing import List +from promptflow import tool + + +@tool +def cleansing(entities_str: str) -> List[str]: + # Split, remove leading and trailing spaces/tabs/dots + parts = entities_str.split(",") + cleaned_parts = [part.strip(" \t.\"") for part in parts] + entities = [part for part in cleaned_parts if len(part) > 0] + return entities diff --git a/examples/flows/evaluation/eval-entity-match-rate/data.jsonl b/examples/flows/evaluation/eval-entity-match-rate/data.jsonl new file mode 100644 index 00000000000..0120e04fb3f --- /dev/null +++ b/examples/flows/evaluation/eval-entity-match-rate/data.jsonl @@ -0,0 +1,2 @@ +{"entities": ["software engineer","CEO"],"ground_truth": "\"CEO, Software Engineer, Finance Manager\""} +{"entities": ["Software Engineer","CEO", "Finance Manager"],"ground_truth": "\"CEO, Software Engineer, Finance Manager\""} diff --git a/examples/flows/evaluation/eval-entity-match-rate/flow.dag.yaml b/examples/flows/evaluation/eval-entity-match-rate/flow.dag.yaml new file mode 100644 index 00000000000..b9d833b8322 --- /dev/null +++ b/examples/flows/evaluation/eval-entity-match-rate/flow.dag.yaml @@ -0,0 +1,39 @@ +inputs: + entities: + type: list + default: + - software engineer + - CEO + ground_truth: + type: string + default: '"CEO, Software Engineer, Finance Manager"' +outputs: + match_cnt: + type: object + reference: ${match.output} +nodes: +- name: cleansing + type: python + source: + type: code + path: cleansing.py + inputs: + entities_str: ${inputs.ground_truth} +- name: match + type: python + source: + type: code + path: match.py + inputs: + answer: ${inputs.entities} + ground_truth: ${cleansing.output} +- name: log_metrics + type: python + source: + type: code + path: log_metrics.py + inputs: + match_counts: ${match.output} + aggregation: true +environment: + python_requirements_txt: requirements.txt \ No newline at end of file diff --git a/examples/flows/evaluation/eval-entity-match-rate/is_match_test.py b/examples/flows/evaluation/eval-entity-match-rate/is_match_test.py new file mode 100644 index 00000000000..5da9d1d5c84 --- /dev/null +++ b/examples/flows/evaluation/eval-entity-match-rate/is_match_test.py @@ -0,0 +1,13 @@ +import unittest + +from match import is_match + + +class IsMatchTest(unittest.TestCase): + def test_normal(self): + self.assertEqual(is_match(["a", "b"], ["B", "a"], True, True, False), True) + self.assertEqual(is_match(["a", "b"], ["B", "a"], True, False, False), False) + self.assertEqual(is_match(["a", "b"], ["B", "a"], False, True, False), False) + self.assertEqual(is_match(["a", "b"], ["B", "a"], False, False, True), False) + self.assertEqual(is_match(["a", "b"], ["a", "b"], False, False, False), True) + self.assertEqual(is_match(["a", "b"], ["a", "b", "c"], True, False, True), True) diff --git a/examples/flows/evaluation/eval-entity-match-rate/log_metrics.py b/examples/flows/evaluation/eval-entity-match-rate/log_metrics.py new file mode 100644 index 00000000000..bf45cca8b31 --- /dev/null +++ b/examples/flows/evaluation/eval-entity-match-rate/log_metrics.py @@ -0,0 +1,20 @@ +from promptflow import tool +from typing import List +from promptflow import log_metric + +# The inputs section will change based on the arguments 
of the tool function after you save the code.
+# Type hints on arguments and the return value help the system show the types properly.
+# Update the function name/signature as needed.
+
+
+@tool
+def log_metrics(match_counts: List[dict]):
+    exact_match_rate = sum([m["exact_match"] for m in match_counts]) / len(match_counts)
+    partial_match_rate = sum([m["partial_match"] for m in match_counts]) / len(match_counts)
+
+    log_metric(key="exact_match_rate", value=exact_match_rate)
+    log_metric(key="partial_match_rate", value=partial_match_rate)
+    print("exact_match_rate: ", exact_match_rate)
+    print("partial_match_rate: ", partial_match_rate)
+
+    return {"exact_match_rate": exact_match_rate, "partial_match_rate": partial_match_rate}
diff --git a/examples/flows/evaluation/eval-entity-match-rate/match.py b/examples/flows/evaluation/eval-entity-match-rate/match.py
new file mode 100644
index 00000000000..ae7dc993130
--- /dev/null
+++ b/examples/flows/evaluation/eval-entity-match-rate/match.py
@@ -0,0 +1,37 @@
+from promptflow import tool
+from typing import List
+
+
+@tool
+def match(answer: List[str], ground_truth: List[str]):
+    exact_match = 0
+    partial_match = 0
+
+    if is_match(answer, ground_truth, ignore_case=True, ignore_order=True, allow_partial=False):
+        exact_match = 1
+
+    if is_match(answer, ground_truth, ignore_case=True, ignore_order=True, allow_partial=True):
+        partial_match = 1
+
+    return {"exact_match": exact_match, "partial_match": partial_match, "answer": answer, "ground_truth": ground_truth}
+
+
+def is_match(
+    answer: List[str],
+    ground_truth: List[str],
+    ignore_case: bool,
+    ignore_order: bool,
+    allow_partial: bool) -> bool:
+    if ignore_case:
+        answer = [a.lower() for a in answer]
+        ground_truth = [g.lower() for g in ground_truth]
+
+    if ignore_order:
+        answer.sort()
+        ground_truth.sort()
+
+    if allow_partial:
+        x = [a for a in answer if a in ground_truth]
+        return x == answer
+
+    return answer == ground_truth
diff --git a/examples/flows/evaluation/eval-entity-match-rate/requirements.txt b/examples/flows/evaluation/eval-entity-match-rate/requirements.txt
new file mode 100644
index 00000000000..16306751554
--- /dev/null
+++ b/examples/flows/evaluation/eval-entity-match-rate/requirements.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://azuremlsdktestpypi.azureedge.net/promptflow/
+promptflow
+promptflow-tools
\ No newline at end of file
diff --git a/examples/flows/evaluation/groundedness-eval/README.md b/examples/flows/evaluation/groundedness-eval/README.md
new file mode 100644
index 00000000000..90ee849da25
--- /dev/null
+++ b/examples/flows/evaluation/groundedness-eval/README.md
@@ -0,0 +1,30 @@
+# Groundedness Evaluation
+
+This flow leverages an LLM to evaluate groundedness: whether the answer states only facts that are present in the given context.
+
+Tools used in this flow:
+- `python` tool
+- built-in `llm` tool
+
+### 0. Setup connection
+
+Prepare your Azure Open AI resource following this [instruction](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) and get your `api_key` if you don't have one.
+
+```bash
+# Override keys with --set to avoid yaml file changes
+pf connection create --file ../../../connections/azure_openai.yml --set api_key=<your_api_key> api_base=<your_api_base>
+```
+
+### 1. Test flow/node
+
+```bash
+# test with default input value in flow.dag.yaml
+pf flow test --flow .
+```
+
+### 2. Create flow run with multi-line data
+
+```bash
+pf run create --flow . 
--data ./data.jsonl --stream +``` + diff --git a/examples/flows/evaluation/groundedness-eval/aggregate.py b/examples/flows/evaluation/groundedness-eval/aggregate.py new file mode 100644 index 00000000000..7ca1c8b04c1 --- /dev/null +++ b/examples/flows/evaluation/groundedness-eval/aggregate.py @@ -0,0 +1,30 @@ +from typing import List +from promptflow import tool + + +@tool +def aggregate(groundedness_scores: List[float]): + """ + This tool aggregates the processed result of all lines to the variant level and log metric for each variant. + + :param processed_results: List of the output of line_process node. + :param variant_ids: List of variant ids that can be used to group the results by variant. + :param line_numbers: List of line numbers of the variants. If provided, this can be used to + group the results by line number. + """ + + aggregated_results = {"groundedness": 0.0, "count": 0} + + # Calculate average groundedness score for each variant + for i in range(len(groundedness_scores)): + aggregated_results["groundedness"] += groundedness_scores[i] + aggregated_results["count"] += 1 + + aggregated_results["groundedness"] /= aggregated_results["count"] + + # Log metric for each variant + from promptflow import log_metric + + log_metric(key="groundedness", value=aggregated_results["groundedness"]) + + return aggregated_results diff --git a/examples/flows/evaluation/groundedness-eval/calc_groundedness.py b/examples/flows/evaluation/groundedness-eval/calc_groundedness.py new file mode 100644 index 00000000000..0375ad65729 --- /dev/null +++ b/examples/flows/evaluation/groundedness-eval/calc_groundedness.py @@ -0,0 +1,15 @@ +from promptflow import tool +import re + + +@tool +def parse_score(gpt_score: str): + return float(extract_float(gpt_score)) + + +def extract_float(s): + match = re.search(r"[-+]?\d*\.\d+|\d+", s) + if match: + return float(match.group()) + else: + return None diff --git a/examples/flows/evaluation/groundedness-eval/data.jsonl b/examples/flows/evaluation/groundedness-eval/data.jsonl new file mode 100644 index 00000000000..66aaf8cb27a --- /dev/null +++ b/examples/flows/evaluation/groundedness-eval/data.jsonl @@ -0,0 +1,2 @@ +{"question": "What is the name of the new language representation model introduced in the document?", "variant_id": "v1", "line_number":1, "answer":"The document mentions multiple language representation models, so it is unclear which one is being referred to as \"new\". Can you provide more specific information or context?","context":["statistical language modeling. arXiv preprint arXiv:1312.3005 . Z. Chen, H. Zhang, X. Zhang, and L. Zhao. 2018. Quora question pairs. Christopher Clark and Matt Gardner. 2018. Simple and effective multi-paragraph reading comprehen- sion. In ACL.Kevin Clark, Minh-Thang Luong, Christopher D Man- ning, and Quoc Le. 2018. Semi-supervised se- quence modeling with cross-view training. In Pro- ceedings of the 2018 Conference on Empirical Meth- ods in Natural Language Processing , pages 1914\u2013 1925. Ronan Collobert and Jason Weston. 2008. A uni\ufb01ed architecture for natural language processing: Deep neural networks with multitask learning. In Pro- ceedings of the 25th international conference on Machine learning , pages 160\u2013167. ACM. Alexis Conneau, Douwe Kiela, Holger Schwenk, Lo \u00a8\u0131c Barrault, and Antoine Bordes. 2017. Supervised learning of universal sentence representations from natural language inference data. 
In Proceedings of the 2017 Conference on Empirical Methods in Nat- ural Language Processing , pages 670\u2013680, Copen- hagen, Denmark. Association for Computational Linguistics. Andrew M Dai and Quoc V Le. 2015. Semi-supervised sequence learning. In Advances in neural informa- tion processing systems , pages 3079\u20133087. J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li, and L. Fei- Fei. 2009. ImageNet: A Large-Scale Hierarchical Image Database. In CVPR09 . William B Dolan and Chris Brockett. 2005. Automati- cally constructing a corpus of sentential paraphrases. InProceedings of the Third International Workshop on Paraphrasing (IWP2005) . William Fedus, Ian Goodfellow, and Andrew M Dai. 2018. Maskgan: Better text generation via \ufb01lling in the.arXiv preprint arXiv:1801.07736 . Dan Hendrycks and Kevin Gimpel. 2016. Bridging nonlinearities and stochastic regularizers with gaus- sian error linear units. CoRR , abs\/1606.08415. Felix Hill, Kyunghyun Cho, and Anna Korhonen. 2016. Learning distributed representations of sentences from unlabelled data. In Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies . Association for Computa- tional Linguistics. Jeremy Howard and Sebastian Ruder. 2018. Universal language model \ufb01ne-tuning for text classi\ufb01cation. In ACL. Association for Computational Linguistics. Minghao Hu, Yuxing Peng, Zhen Huang, Xipeng Qiu, Furu Wei, and Ming Zhou. 2018. Reinforced mnemonic reader for machine reading comprehen- sion. In IJCAI . Yacine Jernite, Samuel R. Bowman, and David Son- tag. 2017. Discourse-based objectives for fast un- supervised sentence representation learning. CoRR , abs\/1705.00557.Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large scale distantly supervised challenge dataset for reading comprehen- sion. In ACL. Ryan Kiros, Yukun Zhu, Ruslan R Salakhutdinov, Richard Zemel, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Skip-thought vectors. In Advances in neural information processing systems , pages 3294\u20133302. Quoc Le and Tomas Mikolov. 2014. Distributed rep- resentations of sentences and documents. In Inter- national Conference on Machine Learning , pages 1188\u20131196. Hector J Levesque, Ernest Davis, and Leora Morgen- stern. 2011. The winograd schema challenge. In Aaai spring symposium: Logical formalizations of commonsense reasoning , volume 46, page 47. Lajanugen Logeswaran and Honglak Lee. 2018. An ef\ufb01cient framework for learning sentence represen- tations. In International Conference on Learning Representations . Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. 2017. Learned in translation: Con-","tool for measuring readability. Journalism Bulletin , 30(4):415\u2013433. Erik F Tjong Kim Sang and Fien De Meulder. 2003. Introduction to the conll-2003 shared task: Language-independent named entity recognition. In CoNLL . Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010. Word representations: A simple and general method for semi-supervised learning. In Proceedings of the 48th Annual Meeting of the Association for Compu- tational Linguistics , ACL \u201910, pages 384\u2013394. Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Pro- cessing Systems , pages 6000\u20136010. 
Pascal Vincent, Hugo Larochelle, Yoshua Bengio, and Pierre-Antoine Manzagol. 2008. Extracting and composing robust features with denoising autoen- coders. In Proceedings of the 25th international conference on Machine learning , pages 1096\u20131103. ACM. Alex Wang, Amanpreet Singh, Julian Michael, Fe- lix Hill, Omer Levy, and Samuel Bowman. 2018a. Glue: A multi-task benchmark and analysis platformfor natural language understanding. In Proceedings of the 2018 EMNLP Workshop BlackboxNLP: An- alyzing and Interpreting Neural Networks for NLP , pages 353\u2013355. Wei Wang, Ming Yan, and Chen Wu. 2018b. Multi- granularity hierarchical attention fusion networks for reading comprehension and question answering. InProceedings of the 56th Annual Meeting of the As- sociation for Computational Linguistics (Volume 1: Long Papers) . Association for Computational Lin- guistics. Alex Warstadt, Amanpreet Singh, and Samuel R Bow- man. 2018. Neural network acceptability judg- ments. arXiv preprint arXiv:1805.12471 . Adina Williams, Nikita Nangia, and Samuel R Bow- man. 2018. A broad-coverage challenge corpus for sentence understanding through inference. In NAACL . Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. Google\u2019s neural ma- chine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 . Jason Yosinski, Jeff Clune, Yoshua Bengio, and Hod Lipson. 2014. How transferable are features in deep neural networks? In Advances in neural information processing systems , pages 3320\u20133328. Adams Wei Yu, David Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V Le. 2018. QANet: Combining local convolution with global self-attention for reading comprehen- sion. In ICLR . Rowan Zellers, Yonatan Bisk, Roy Schwartz, and Yejin Choi. 2018. Swag: A large-scale adversarial dataset for grounded commonsense inference. In Proceed- ings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP) . Yukun Zhu, Ryan Kiros, Rich Zemel, Ruslan Salakhut- dinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In Proceedings of the IEEE international conference on computer vision , pages 19\u201327. Appendix for \u201cBERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\u201d We organize the appendix into three sections: \u2022 Additional implementation details for BERT are presented in Appendix A;\u2022 Additional details for our experiments are presented in Appendix B; and \u2022 Additional ablation studies are presented in Appendix C. We present additional ablation studies for BERT including: \u2013Effect of Number of Training Steps; and \u2013Ablation for Different"]} +{"question": "What is the main difference between BERT and previous language representation models?", "variant_id": "v1", "line_number": 2, "answer":"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers, allowing it to incorporate context from both directions. 
This is unlike previous language representation models that are unidirectional, which limits the choice of architectures that can be used during pre-training and could be sub-optimal for sentence-level tasks and token-level tasks such as question answering.","context":["BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding Jacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova Google AI Language fjacobdevlin,mingweichang,kentonl,kristout g@google.com Abstract We introduce a new language representa- tion model called BERT , which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language repre- sentation models (Peters et al., 2018a; Rad- ford et al., 2018), BERT is designed to pre- train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a re- sult, the pre-trained BERT model can be \ufb01ne- tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task- speci\ufb01c architecture modi\ufb01cations. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art re- sults on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answer- ing Test F1 to 93.2 (1.5 point absolute im- provement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement). 1 Introduction Language model pre-training has been shown to be effective for improving many natural language processing tasks (Dai and Le, 2015; Peters et al., 2018a; Radford et al., 2018; Howard and Ruder, 2018). These include sentence-level tasks such as natural language inference (Bowman et al., 2015; Williams et al., 2018) and paraphrasing (Dolan and Brockett, 2005), which aim to predict the re- lationships between sentences by analyzing them holistically, as well as token-level tasks such as named entity recognition and question answering, where models are required to produce \ufb01ne-grained output at the token level (Tjong Kim Sang and De Meulder, 2003; Rajpurkar et al., 2016).There are two existing strategies for apply- ing pre-trained language representations to down- stream tasks: feature-based and\ufb01ne-tuning . The feature-based approach, such as ELMo (Peters et al., 2018a), uses task-speci\ufb01c architectures that include the pre-trained representations as addi- tional features. The \ufb01ne-tuning approach, such as the Generative Pre-trained Transformer (OpenAI GPT) (Radford et al., 2018), introduces minimal task-speci\ufb01c parameters, and is trained on the downstream tasks by simply \ufb01ne-tuning allpre- trained parameters. The two approaches share the same objective function during pre-training, where they use unidirectional language models to learn general language representations. We argue that current techniques restrict the power of the pre-trained representations, espe- cially for the \ufb01ne-tuning approaches. The ma- jor limitation is that standard language models are unidirectional, and this limits the choice of archi- tectures that can be used during pre-training. For example, in OpenAI GPT, the authors use a left-to- right architecture, where every token can only at- tend to previous tokens in the self-attention layers of the Transformer (Vaswani et al., 2017). 
Such re- strictions are sub-optimal for sentence-level tasks, and could be very harmful when applying \ufb01ne- tuning based approaches to token-level tasks such as question answering, where it is crucial to incor- porate context from both directions. In this paper, we improve the \ufb01ne-tuning based approaches by proposing BERT: Bidirectional Encoder Representations from Transformers.","the self-attention layers of the Transformer (Vaswani et al., 2017). Such re- strictions are sub-optimal for sentence-level tasks, and could be very harmful when applying \ufb01ne- tuning based approaches to token-level tasks such as question answering, where it is crucial to incor- porate context from both directions. In this paper, we improve the \ufb01ne-tuning based approaches by proposing BERT: Bidirectional Encoder Representations from Transformers. BERT alleviates the previously mentioned unidi- rectionality constraint by using a \u201cmasked lan- guage model\u201d (MLM) pre-training objective, in- spired by the Cloze task (Taylor, 1953). The masked language model randomly masks some of the tokens from the input, and the objective is to predict the original vocabulary id of the maskedarXiv:1810.04805v2 [cs.CL] 24 May 2019word based only on its context. Unlike left-to- right language model pre-training, the MLM ob- jective enables the representation to fuse the left and the right context, which allows us to pre- train a deep bidirectional Transformer. In addi- tion to the masked language model, we also use a \u201cnext sentence prediction\u201d task that jointly pre- trains text-pair representations. The contributions of our paper are as follows: \u2022 We demonstrate the importance of bidirectional pre-training for language representations. Un- like Radford et al. (2018), which uses unidirec- tional language models for pre-training, BERT uses masked language models to enable pre- trained deep bidirectional representations. This is also in contrast to Peters et al. (2018a), which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs. \u2022 We show that pre-trained representations reduce the need for many heavily-engineered task- speci\ufb01c architectures. BERT is the \ufb01rst \ufb01ne- tuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level andtoken-level tasks, outper- forming many task-speci\ufb01c architectures. \u2022 BERT advances the state of the art for eleven NLP tasks. The code and pre-trained mod- els are available at https:\/\/github.com\/ google-research\/bert . 2 Related Work There is a long history of pre-training general lan- guage representations, and we brie\ufb02y review the most widely-used approaches in this section. 2.1 Unsupervised Feature-based Approaches Learning widely applicable representations of words has been an active area of research for decades, including non-neural (Brown et al., 1992; Ando and Zhang, 2005; Blitzer et al., 2006) and neural (Mikolov et al., 2013; Pennington et al., 2014) methods. Pre-trained word embeddings are an integral part of modern NLP systems, of- fering signi\ufb01cant improvements over embeddings learned from scratch (Turian et al., 2010). 
To pre- train word embedding vectors, left-to-right lan- guage modeling objectives have been used (Mnih and Hinton, 2009), as well as objectives to dis- criminate correct from incorrect words in left and right context (Mikolov et al., 2013).These approaches have been generalized to coarser granularities, such as sentence embed- dings (Kiros et al., 2015; Logeswaran and Lee, 2018) or paragraph embeddings (Le and Mikolov, 2014). To train sentence representations, prior work has used objectives to rank candidate next sentences (Jernite et al., 2017; Logeswaran and Lee, 2018), left-to-right generation of next sen- tence words given a representation of the previous sentence (Kiros et al., 2015), or denoising auto- encoder","the token-level classi\ufb01er over the NER label set. To ablate the \ufb01ne-tuning approach, we apply the feature-based approach by extracting the activa- tions from one or more layers without \ufb01ne-tuning any parameters of BERT. These contextual em- beddings are used as input to a randomly initial- ized two-layer 768-dimensional BiLSTM before the classi\ufb01cation layer. Results are presented in Table 7. BERT LARGE performs competitively with state-of-the-art meth- ods. The best performing method concatenates the token representations from the top four hidden lay- ers of the pre-trained Transformer, which is only 0.3 F1 behind \ufb01ne-tuning the entire model. This demonstrates that BERT is effective for both \ufb01ne- tuning and feature-based approaches. 6 Conclusion Recent empirical improvements due to transfer learning with language models have demonstrated that rich, unsupervised pre-training is an integral part of many language understanding systems. In particular, these results enable even low-resource tasks to bene\ufb01t from deep unidirectional architec- tures. Our major contribution is further general- izing these \ufb01ndings to deep bidirectional architec- tures, allowing the same pre-trained model to suc- cessfully tackle a broad set of NLP tasks.References Alan Akbik, Duncan Blythe, and Roland V ollgraf. 2018. Contextual string embeddings for sequence labeling. In Proceedings of the 27th International Conference on Computational Linguistics , pages 1638\u20131649. Rami Al-Rfou, Dokook Choe, Noah Constant, Mandy Guo, and Llion Jones. 2018. Character-level lan- guage modeling with deeper self-attention. arXiv preprint arXiv:1808.04444 . Rie Kubota Ando and Tong Zhang. 2005. A framework for learning predictive structures from multiple tasks and unlabeled data. Journal of Machine Learning Research , 6(Nov):1817\u20131853. Luisa Bentivogli, Bernardo Magnini, Ido Dagan, Hoa Trang Dang, and Danilo Giampiccolo. 2009. The \ufb01fth PASCAL recognizing textual entailment challenge. In TAC. NIST. John Blitzer, Ryan McDonald, and Fernando Pereira. 2006. Domain adaptation with structural correspon- dence learning. In Proceedings of the 2006 confer- ence on empirical methods in natural language pro- cessing , pages 120\u2013128. Association for Computa- tional Linguistics. Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large anno- tated corpus for learning natural language inference. InEMNLP . Association for Computational Linguis- tics. Peter F Brown, Peter V Desouza, Robert L Mercer, Vincent J Della Pietra, and Jenifer C Lai. 1992. Class-based n-gram models of natural language. Computational linguistics , 18(4):467\u2013479. Daniel Cer, Mona Diab, Eneko Agirre, Inigo Lopez- Gazpio, and Lucia Specia. 2017. 
Semeval-2017 task 1: Semantic textual similarity multilingual and crosslingual focused evaluation. In Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017) , pages 1\u201314, Vancou- ver, Canada. Association for Computational Lin- guistics. Ciprian Chelba, Tomas Mikolov, Mike Schuster, Qi Ge, Thorsten Brants, Phillipp Koehn, and Tony Robin- son. 2013. One billion word benchmark for measur- ing progress in statistical language modeling. arXiv preprint arXiv:1312.3005 . Z. Chen, H. Zhang, X. Zhang, and L. Zhao. 2018. Quora question pairs. Christopher Clark and Matt Gardner. 2018. Simple and effective multi-paragraph reading comprehen- sion. In ACL.Kevin Clark, Minh-Thang Luong, Christopher D Man- ning, and Quoc Le. 2018. Semi-supervised se- quence modeling with cross-view training. In Pro- ceedings of the 2018 Conference on Empirical Meth- ods"]} \ No newline at end of file diff --git a/examples/flows/evaluation/groundedness-eval/flow.dag.yaml b/examples/flows/evaluation/groundedness-eval/flow.dag.yaml new file mode 100644 index 00000000000..e631eec677f --- /dev/null +++ b/examples/flows/evaluation/groundedness-eval/flow.dag.yaml @@ -0,0 +1,261 @@ +environment: + python_requirements_txt: requirements.txt +inputs: + question: + type: string + default: What is the name of the new language representation model introduced in + the document? + answer: + type: string + default: The document mentions multiple language representation models, so it is + unclear which one is being referred to as \"new\". Can you provide more + specific information or context? + context: + type: string + default: '["statistical language modeling. arXiv preprint arXiv:1312.3005 . Z. + Chen, H. Zhang, X. Zhang, and L. Zhao. 2018. Quora question pairs. + Christopher Clark and Matt Gardner. 2018. Simple and effective + multi-paragraph reading comprehen- sion. In ACL.Kevin Clark, Minh-Thang + Luong, Christopher D Man- ning, and Quoc Le. 2018. Semi-supervised se- + quence modeling with cross-view training. In Pro- ceedings of the 2018 + Conference on Empirical Meth- ods in Natural Language Processing , pages + 1914\u2013 1925. Ronan Collobert and Jason Weston. 2008. A uni\ufb01ed + architecture for natural language processing: Deep neural networks with + multitask learning. In Pro- ceedings of the 25th international conference + on Machine learning , pages 160\u2013167. ACM. Alexis Conneau, Douwe + Kiela, Holger Schwenk, Lo \u00a8\u0131c Barrault, and Antoine Bordes. + 2017. Supervised learning of universal sentence representations from + natural language inference data. In Proceedings of the 2017 Conference on + Empirical Methods in Nat- ural Language Processing , pages 670\u2013680, + Copen- hagen, Denmark. Association for Computational Linguistics. Andrew M + Dai and Quoc V Le. 2015. Semi-supervised sequence learning. In Advances in + neural informa- tion processing systems , pages 3079\u20133087. J. Deng, + W. Dong, R. Socher, L.-J. Li, K. Li, and L. Fei- Fei. 2009. ImageNet: A + Large-Scale Hierarchical Image Database. In CVPR09 . William B Dolan and + Chris Brockett. 2005. Automati- cally constructing a corpus of sentential + paraphrases. InProceedings of the Third International Workshop on + Paraphrasing (IWP2005) . William Fedus, Ian Goodfellow, and Andrew M Dai. + 2018. Maskgan: Better text generation via \ufb01lling in the.arXiv + preprint arXiv:1801.07736 . Dan Hendrycks and Kevin Gimpel. 2016. 
Bridging + nonlinearities and stochastic regularizers with gaus- sian error linear + units. CoRR , abs\/1606.08415. Felix Hill, Kyunghyun Cho, and Anna + Korhonen. 2016. Learning distributed representations of sentences from + unlabelled data. In Proceedings of the 2016 Conference of the North + American Chapter of the Association for Computational Linguistics: Human + Language Technologies . Association for Computa- tional Linguistics. + Jeremy Howard and Sebastian Ruder. 2018. Universal language model + \ufb01ne-tuning for text classi\ufb01cation. In ACL. Association for + Computational Linguistics. Minghao Hu, Yuxing Peng, Zhen Huang, Xipeng + Qiu, Furu Wei, and Ming Zhou. 2018. Reinforced mnemonic reader for machine + reading comprehen- sion. In IJCAI . Yacine Jernite, Samuel R. Bowman, and + David Son- tag. 2017. Discourse-based objectives for fast un- supervised + sentence representation learning. CoRR , abs\/1705.00557.Mandar Joshi, + Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large + scale distantly supervised challenge dataset for reading comprehen- sion. + In ACL. Ryan Kiros, Yukun Zhu, Ruslan R Salakhutdinov, Richard Zemel, + Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Skip-thought + vectors. In Advances in neural information processing systems , pages + 3294\u20133302. Quoc Le and Tomas Mikolov. 2014. Distributed rep- + resentations of sentences and documents. In Inter- national Conference on + Machine Learning , pages 1188\u20131196. Hector J Levesque, Ernest Davis, + and Leora Morgen- stern. 2011. The winograd schema challenge. In Aaai + spring symposium: Logical formalizations of commonsense reasoning , volume + 46, page 47. Lajanugen Logeswaran and Honglak Lee. 2018. An ef\ufb01cient + framework for learning sentence represen- tations. In International + Conference on Learning Representations . Bryan McCann, James Bradbury, + Caiming Xiong, and Richard Socher. 2017. Learned in translation: + Con-","tool for measuring readability. Journalism Bulletin , + 30(4):415\u2013433. Erik F Tjong Kim Sang and Fien De Meulder. 2003. + Introduction to the conll-2003 shared task: Language-independent named + entity recognition. In CoNLL . Joseph Turian, Lev Ratinov, and Yoshua + Bengio. 2010. Word representations: A simple and general method for + semi-supervised learning. In Proceedings of the 48th Annual Meeting of the + Association for Compu- tational Linguistics , ACL \u201910, pages + 384\u2013394. Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. + Attention is all you need. In Advances in Neural Information Pro- cessing + Systems , pages 6000\u20136010. Pascal Vincent, Hugo Larochelle, Yoshua + Bengio, and Pierre-Antoine Manzagol. 2008. Extracting and composing robust + features with denoising autoen- coders. In Proceedings of the 25th + international conference on Machine learning , pages 1096\u20131103. ACM. + Alex Wang, Amanpreet Singh, Julian Michael, Fe- lix Hill, Omer Levy, and + Samuel Bowman. 2018a. Glue: A multi-task benchmark and analysis + platformfor natural language understanding. In Proceedings of the 2018 + EMNLP Workshop BlackboxNLP: An- alyzing and Interpreting Neural Networks + for NLP , pages 353\u2013355. Wei Wang, Ming Yan, and Chen Wu. 2018b. + Multi- granularity hierarchical attention fusion networks for reading + comprehension and question answering. 
InProceedings of the 56th Annual + Meeting of the As- sociation for Computational Linguistics (Volume 1: Long + Papers) . Association for Computational Lin- guistics. Alex Warstadt, + Amanpreet Singh, and Samuel R Bow- man. 2018. Neural network acceptability + judg- ments. arXiv preprint arXiv:1805.12471 . Adina Williams, Nikita + Nangia, and Samuel R Bow- man. 2018. A broad-coverage challenge corpus for + sentence understanding through inference. In NAACL . Yonghui Wu, Mike + Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, + Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. + Google\u2019s neural ma- chine translation system: Bridging the gap + between human and machine translation. arXiv preprint arXiv:1609.08144 . + Jason Yosinski, Jeff Clune, Yoshua Bengio, and Hod Lipson. 2014. How + transferable are features in deep neural networks? In Advances in neural + information processing systems , pages 3320\u20133328. Adams Wei Yu, David + Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V + Le. 2018. QANet: Combining local convolution with global self-attention + for reading comprehen- sion. In ICLR . Rowan Zellers, Yonatan Bisk, Roy + Schwartz, and Yejin Choi. 2018. Swag: A large-scale adversarial dataset + for grounded commonsense inference. In Proceed- ings of the 2018 + Conference on Empirical Methods in Natural Language Processing (EMNLP) . + Yukun Zhu, Ryan Kiros, Rich Zemel, Ruslan Salakhut- dinov, Raquel Urtasun, + Antonio Torralba, and Sanja Fidler. 2015. Aligning books and movies: + Towards story-like visual explanations by watching movies and reading + books. In Proceedings of the IEEE international conference on computer + vision , pages 19\u201327. Appendix for \u201cBERT: Pre-training of Deep + Bidirectional Transformers for Language Understanding\u201d We organize + the appendix into three sections: \u2022 Additional implementation details + for BERT are presented in Appendix A;\u2022 Additional details for our + experiments are presented in Appendix B; and \u2022 Additional ablation + studies are presented in Appendix C. We present additional ablation + studies for BERT including: \u2013Effect of Number of Training Steps; and + \u2013Ablation for Different"]} {"question": "What is the main difference + between BERT and previous language representation models?", "variant_id": + "v1", "line_number": 2, answer":"BERT is designed to pre-train deep + bidirectional representations from unlabeled text by jointly conditioning + on both left and right context in all layers, allowing it to incorporate + context from both directions. This is unlike previous language + representation models that are unidirectional, which limits the choice of + architectures that can be used during pre-training and could be + sub-optimal for sentence-level tasks and token-level tasks such as + question answering.","context":["BERT: Pre-training of Deep Bidirectional + Transformers for Language Understanding Jacob Devlin Ming-Wei Chang Kenton + Lee Kristina Toutanova Google AI Language + fjacobdevlin,mingweichang,kentonl,kristout g@google.com Abstract We + introduce a new language representa- tion model called BERT , which stands + for Bidirectional Encoder Representations from Transformers. Unlike recent + language repre- sentation models (Peters et al., 2018a; Rad- ford et al., + 2018), BERT is designed to pre- train deep bidirectional representations + from unlabeled text by jointly conditioning on both left and right context + in all layers. 
As a re- sult, the pre-trained BERT model can be \ufb01ne- + tuned with just one additional output layer to create state-of-the-art + models for a wide range of tasks, such as question answering and language + inference, without substantial task- speci\ufb01c architecture + modi\ufb01cations. BERT is conceptually simple and empirically powerful. + It obtains new state-of-the-art re- sults on eleven natural language + processing tasks, including pushing the GLUE score to 80.5% (7.7% point + absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute + improvement), SQuAD v1.1 question answer- ing Test F1 to 93.2 (1.5 point + absolute im- provement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute + improvement). 1 Introduction Language model pre-training has been shown to + be effective for improving many natural language processing tasks (Dai and + Le, 2015; Peters et al., 2018a; Radford et al., 2018; Howard and Ruder, + 2018). These include sentence-level tasks such as natural language + inference (Bowman et al., 2015; Williams et al., 2018) and paraphrasing + (Dolan and Brockett, 2005), which aim to predict the re- lationships + between sentences by analyzing them holistically, as well as token-level + tasks such as named entity recognition and question answering, where + models are required to produce \ufb01ne-grained output at the token level + (Tjong Kim Sang and De Meulder, 2003; Rajpurkar et al., 2016).There are + two existing strategies for apply- ing pre-trained language + representations to down- stream tasks: feature-based and\ufb01ne-tuning . + The feature-based approach, such as ELMo (Peters et al., 2018a), uses + task-speci\ufb01c architectures that include the pre-trained + representations as addi- tional features. The \ufb01ne-tuning approach, + such as the Generative Pre-trained Transformer (OpenAI GPT) (Radford et + al., 2018), introduces minimal task-speci\ufb01c parameters, and is + trained on the downstream tasks by simply \ufb01ne-tuning allpre- trained + parameters. The two approaches share the same objective function during + pre-training, where they use unidirectional language models to learn + general language representations. We argue that current techniques + restrict the power of the pre-trained representations, espe- cially for + the \ufb01ne-tuning approaches. The ma- jor limitation is that standard + language models are unidirectional, and this limits the choice of archi- + tectures that can be used during pre-training. For example, in OpenAI GPT, + the authors use a left-to- right architecture, where every token can only + at- tend to previous tokens in the self-attention layers of the + Transformer (Vaswani et al., 2017). Such re- strictions are sub-optimal + for sentence-level tasks, and could be very harmful when applying + \ufb01ne- tuning based approaches to token-level tasks such as question + answering, where it is crucial to incor- porate context from both + directions. In this paper, we improve the \ufb01ne-tuning based approaches + by proposing BERT: Bidirectional Encoder Representations from + Transformers.","the self-attention layers of the Transformer (Vaswani et + al., 2017). Such re- strictions are sub-optimal for sentence-level tasks, + and could be very harmful when applying \ufb01ne- tuning based approaches + to token-level tasks such as question answering, where it is crucial to + incor- porate context from both directions. 
In this paper, we improve the + \ufb01ne-tuning based approaches by proposing BERT: Bidirectional Encoder + Representations from Transformers. BERT alleviates the previously + mentioned unidi- rectionality constraint by using a \u201cmasked lan- + guage model\u201d (MLM) pre-training objective, in- spired by the Cloze + task (Taylor, 1953). The masked language model randomly masks some of the + tokens from the input, and the objective is to predict the original + vocabulary id of the maskedarXiv:1810.04805v2 [cs.CL] 24 May 2019word + based only on its context. Unlike left-to- right language model + pre-training, the MLM ob- jective enables the representation to fuse the + left and the right context, which allows us to pre- train a deep + bidirectional Transformer. In addi- tion to the masked language model, we + also use a \u201cnext sentence prediction\u201d task that jointly pre- + trains text-pair representations. The contributions of our paper are as + follows: \u2022 We demonstrate the importance of bidirectional + pre-training for language representations. Un- like Radford et al. (2018), + which uses unidirec- tional language models for pre-training, BERT uses + masked language models to enable pre- trained deep bidirectional + representations. This is also in contrast to Peters et al. (2018a), which + uses a shallow concatenation of independently trained left-to-right and + right-to-left LMs. \u2022 We show that pre-trained representations reduce + the need for many heavily-engineered task- speci\ufb01c architectures. + BERT is the \ufb01rst \ufb01ne- tuning based representation model that + achieves state-of-the-art performance on a large suite of sentence-level + andtoken-level tasks, outper- forming many task-speci\ufb01c + architectures. \u2022 BERT advances the state of the art for eleven NLP + tasks. The code and pre-trained mod- els are available at + https:\/\/github.com\/ google-research\/bert . 2 Related Work There is a + long history of pre-training general lan- guage representations, and we + brie\ufb02y review the most widely-used approaches in this section. 2.1 + Unsupervised Feature-based Approaches Learning widely applicable + representations of words has been an active area of research for decades, + including non-neural (Brown et al., 1992; Ando and Zhang, 2005; Blitzer et + al., 2006) and neural (Mikolov et al., 2013; Pennington et al., 2014) + methods. Pre-trained word embeddings are an integral part of modern NLP + systems, of- fering signi\ufb01cant improvements over embeddings learned + from scratch (Turian et al., 2010). To pre- train word embedding vectors, + left-to-right lan- guage modeling objectives have been used (Mnih and + Hinton, 2009), as well as objectives to dis- criminate correct from + incorrect words in left and right context (Mikolov et al., 2013).These + approaches have been generalized to coarser granularities, such as + sentence embed- dings (Kiros et al., 2015; Logeswaran and Lee, 2018) or + paragraph embeddings (Le and Mikolov, 2014). 
"]' +outputs: + groundedness: + type: string + reference: ${parse_score.output} +nodes: +- name: parse_score + type: python + source: + type: code + path: calc_groundedness.py + inputs: + gpt_score: ${gpt_groundedness.output} +- name: aggregate + type: python + source: + type: code + path: aggregate.py + inputs: + groundedness_scores: ${parse_score.output} + aggregation: true +- name: gpt_groundedness + type: llm + source: + type: code + path: gpt_groundedness.md + inputs: + deployment_name: gpt-35-turbo + max_tokens: 5 + answer: ${inputs.answer} + question: ${inputs.question} + context: ${inputs.context} + temperature: 0 + provider: AzureOpenAI + connection: azure_open_ai_connection + api: chat + module: promptflow.tools.aoai diff --git a/examples/flows/evaluation/groundedness-eval/gpt_groundedness.md b/examples/flows/evaluation/groundedness-eval/gpt_groundedness.md new file mode 100644 index 00000000000..b72f45b0e46 --- /dev/null +++ b/examples/flows/evaluation/groundedness-eval/gpt_groundedness.md @@ -0,0 +1,22 @@ +user: +# Instructions + +* There are many chatbots that can answer users questions based on the context given from different sources like search results, or snippets from books/papers. They try to understand users's question and then get context by either performing search from search engines, databases or books/papers for relevant content. Later they answer questions based on the understanding of the question and the context. +* Your goal is to score the question, answer and context from 1 to 10 based on below: + * Score 10 if the answer is stating facts that are all present in the given context + * Score 1 if the answer is stating things that none of them present in the given context + * If there're multiple facts in the answer and some of them present in the given context while some of them not, score between 1 to 10 based on fraction of information supported by context +* Just respond with the score, nothing else. + +# Real work + +## Question +{{question}} + +## Answer +{{answer}} + +## Context +{{context}} + +## Score \ No newline at end of file diff --git a/examples/flows/evaluation/groundedness-eval/requirements.txt b/examples/flows/evaluation/groundedness-eval/requirements.txt new file mode 100644 index 00000000000..16306751554 --- /dev/null +++ b/examples/flows/evaluation/groundedness-eval/requirements.txt @@ -0,0 +1,3 @@ +--extra-index-url https://azuremlsdktestpypi.azureedge.net/promptflow/ +promptflow +promptflow-tools \ No newline at end of file diff --git a/examples/flows/evaluation/perceived-intelligence-eval/README.md b/examples/flows/evaluation/perceived-intelligence-eval/README.md new file mode 100644 index 00000000000..b66f57b0e26 --- /dev/null +++ b/examples/flows/evaluation/perceived-intelligence-eval/README.md @@ -0,0 +1,31 @@ +# Groundedness Evaluation + +This is a flow leverage llm to eval percieved intelligence. +Perceived intelligence is the degree to which a bot can impress the user with its responses, by showing originality, insight, creativity, knowledge, and adaptability. + +Tools used in this flow: +- `python` tool +- built-in `llm` tool + +### 0. Setup connection + +Prepare your Azure Open AI resource follow this [instruction](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) and get your `api_key` if you don't have one. 
+ +```bash +# Override keys with --set to avoid yaml file changes +pf connection create --file ../../../connections/azure_openai.yml --set api_key= api_base= +``` + +### 1. Test flow/node + +```bash +# test with default input value in flow.dag.yaml +pf flow test --flow . +``` + +### 2. Create flow run with multi-line data + +```bash +pf run create --flow . --data ./data.jsonl --stream +``` + diff --git a/examples/flows/evaluation/perceived-intelligence-eval/aggregate.py b/examples/flows/evaluation/perceived-intelligence-eval/aggregate.py new file mode 100644 index 00000000000..a73f3f29845 --- /dev/null +++ b/examples/flows/evaluation/perceived-intelligence-eval/aggregate.py @@ -0,0 +1,21 @@ +from typing import List +from promptflow import tool + + +@tool +def aggregate(perceived_intelligence_score: List[float]): +    aggregated_results = {"perceived_intelligence_score": 0.0, "count": 0} + +    # Calculate average perceived_intelligence_score +    for i in range(len(perceived_intelligence_score)): +        aggregated_results["perceived_intelligence_score"] += perceived_intelligence_score[i] +        aggregated_results["count"] += 1 + +    aggregated_results["perceived_intelligence_score"] /= aggregated_results["count"] + +    # Log the aggregated metric +    from promptflow import log_metric + +    log_metric(key="perceived_intelligence_score", value=aggregated_results["perceived_intelligence_score"]) + +    return aggregated_results diff --git a/examples/flows/evaluation/perceived-intelligence-eval/data.jsonl b/examples/flows/evaluation/perceived-intelligence-eval/data.jsonl new file mode 100644 index 00000000000..66aaf8cb27a --- /dev/null +++ b/examples/flows/evaluation/perceived-intelligence-eval/data.jsonl @@ -0,0 +1,2 @@ +{"question": "What is the name of the new language representation model introduced in the document?", "variant_id": "v1", "line_number":1, "answer":"The document mentions multiple language representation models, so it is unclear which one is being referred to as \"new\". Can you provide more specific information or context?","context":["statistical language modeling. arXiv preprint arXiv:1312.3005 . Z. Chen, H. Zhang, X. Zhang, and L. Zhao. 2018. Quora question pairs. Christopher Clark and Matt Gardner. 2018. Simple and effective multi-paragraph reading comprehen- sion. In ACL.Kevin Clark, Minh-Thang Luong, Christopher D Man- ning, and Quoc Le. 2018. Semi-supervised se- quence modeling with cross-view training. In Pro- ceedings of the 2018 Conference on Empirical Meth- ods in Natural Language Processing , pages 1914\u2013 1925. Ronan Collobert and Jason Weston. 2008. A uni\ufb01ed architecture for natural language processing: Deep neural networks with multitask learning. In Pro- ceedings of the 25th international conference on Machine learning , pages 160\u2013167. ACM. Alexis Conneau, Douwe Kiela, Holger Schwenk, Lo \u00a8\u0131c Barrault, and Antoine Bordes. 2017. Supervised learning of universal sentence representations from natural language inference data. In Proceedings of the 2017 Conference on Empirical Methods in Nat- ural Language Processing , pages 670\u2013680, Copen- hagen, Denmark. Association for Computational Linguistics. Andrew M Dai and Quoc V Le. 2015. Semi-supervised sequence learning. In Advances in neural informa- tion processing systems , pages 3079\u20133087. J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li, and L. Fei- Fei. 2009. ImageNet: A Large-Scale Hierarchical Image Database. In CVPR09 . William B Dolan and Chris Brockett. 2005. 
Automati- cally constructing a corpus of sentential paraphrases. InProceedings of the Third International Workshop on Paraphrasing (IWP2005) . William Fedus, Ian Goodfellow, and Andrew M Dai. 2018. Maskgan: Better text generation via \ufb01lling in the.arXiv preprint arXiv:1801.07736 . Dan Hendrycks and Kevin Gimpel. 2016. Bridging nonlinearities and stochastic regularizers with gaus- sian error linear units. CoRR , abs\/1606.08415. Felix Hill, Kyunghyun Cho, and Anna Korhonen. 2016. Learning distributed representations of sentences from unlabelled data. In Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies . Association for Computa- tional Linguistics. Jeremy Howard and Sebastian Ruder. 2018. Universal language model \ufb01ne-tuning for text classi\ufb01cation. In ACL. Association for Computational Linguistics. Minghao Hu, Yuxing Peng, Zhen Huang, Xipeng Qiu, Furu Wei, and Ming Zhou. 2018. Reinforced mnemonic reader for machine reading comprehen- sion. In IJCAI . Yacine Jernite, Samuel R. Bowman, and David Son- tag. 2017. Discourse-based objectives for fast un- supervised sentence representation learning. CoRR , abs\/1705.00557.Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large scale distantly supervised challenge dataset for reading comprehen- sion. In ACL. Ryan Kiros, Yukun Zhu, Ruslan R Salakhutdinov, Richard Zemel, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Skip-thought vectors. In Advances in neural information processing systems , pages 3294\u20133302. Quoc Le and Tomas Mikolov. 2014. Distributed rep- resentations of sentences and documents. In Inter- national Conference on Machine Learning , pages 1188\u20131196. Hector J Levesque, Ernest Davis, and Leora Morgen- stern. 2011. The winograd schema challenge. In Aaai spring symposium: Logical formalizations of commonsense reasoning , volume 46, page 47. Lajanugen Logeswaran and Honglak Lee. 2018. An ef\ufb01cient framework for learning sentence represen- tations. In International Conference on Learning Representations . Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. 2017. Learned in translation: Con-","tool for measuring readability. Journalism Bulletin , 30(4):415\u2013433. Erik F Tjong Kim Sang and Fien De Meulder. 2003. Introduction to the conll-2003 shared task: Language-independent named entity recognition. In CoNLL . Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010. Word representations: A simple and general method for semi-supervised learning. In Proceedings of the 48th Annual Meeting of the Association for Compu- tational Linguistics , ACL \u201910, pages 384\u2013394. Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Pro- cessing Systems , pages 6000\u20136010. Pascal Vincent, Hugo Larochelle, Yoshua Bengio, and Pierre-Antoine Manzagol. 2008. Extracting and composing robust features with denoising autoen- coders. In Proceedings of the 25th international conference on Machine learning , pages 1096\u20131103. ACM. Alex Wang, Amanpreet Singh, Julian Michael, Fe- lix Hill, Omer Levy, and Samuel Bowman. 2018a. Glue: A multi-task benchmark and analysis platformfor natural language understanding. In Proceedings of the 2018 EMNLP Workshop BlackboxNLP: An- alyzing and Interpreting Neural Networks for NLP , pages 353\u2013355. 
Wei Wang, Ming Yan, and Chen Wu. 2018b. Multi- granularity hierarchical attention fusion networks for reading comprehension and question answering. InProceedings of the 56th Annual Meeting of the As- sociation for Computational Linguistics (Volume 1: Long Papers) . Association for Computational Lin- guistics. Alex Warstadt, Amanpreet Singh, and Samuel R Bow- man. 2018. Neural network acceptability judg- ments. arXiv preprint arXiv:1805.12471 . Adina Williams, Nikita Nangia, and Samuel R Bow- man. 2018. A broad-coverage challenge corpus for sentence understanding through inference. In NAACL . Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. Google\u2019s neural ma- chine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 . Jason Yosinski, Jeff Clune, Yoshua Bengio, and Hod Lipson. 2014. How transferable are features in deep neural networks? In Advances in neural information processing systems , pages 3320\u20133328. Adams Wei Yu, David Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V Le. 2018. QANet: Combining local convolution with global self-attention for reading comprehen- sion. In ICLR . Rowan Zellers, Yonatan Bisk, Roy Schwartz, and Yejin Choi. 2018. Swag: A large-scale adversarial dataset for grounded commonsense inference. In Proceed- ings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP) . Yukun Zhu, Ryan Kiros, Rich Zemel, Ruslan Salakhut- dinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In Proceedings of the IEEE international conference on computer vision , pages 19\u201327. Appendix for \u201cBERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\u201d We organize the appendix into three sections: \u2022 Additional implementation details for BERT are presented in Appendix A;\u2022 Additional details for our experiments are presented in Appendix B; and \u2022 Additional ablation studies are presented in Appendix C. We present additional ablation studies for BERT including: \u2013Effect of Number of Training Steps; and \u2013Ablation for Different"]} +{"question": "What is the main difference between BERT and previous language representation models?", "variant_id": "v1", "line_number": 2, "answer":"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers, allowing it to incorporate context from both directions. This is unlike previous language representation models that are unidirectional, which limits the choice of architectures that can be used during pre-training and could be sub-optimal for sentence-level tasks and token-level tasks such as question answering.","context":["BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding Jacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova Google AI Language fjacobdevlin,mingweichang,kentonl,kristout g@google.com Abstract We introduce a new language representa- tion model called BERT , which stands for Bidirectional Encoder Representations from Transformers. 
Unlike recent language repre- sentation models (Peters et al., 2018a; Rad- ford et al., 2018), BERT is designed to pre- train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a re- sult, the pre-trained BERT model can be \ufb01ne- tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task- speci\ufb01c architecture modi\ufb01cations. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art re- sults on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answer- ing Test F1 to 93.2 (1.5 point absolute im- provement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement). 1 Introduction Language model pre-training has been shown to be effective for improving many natural language processing tasks (Dai and Le, 2015; Peters et al., 2018a; Radford et al., 2018; Howard and Ruder, 2018). These include sentence-level tasks such as natural language inference (Bowman et al., 2015; Williams et al., 2018) and paraphrasing (Dolan and Brockett, 2005), which aim to predict the re- lationships between sentences by analyzing them holistically, as well as token-level tasks such as named entity recognition and question answering, where models are required to produce \ufb01ne-grained output at the token level (Tjong Kim Sang and De Meulder, 2003; Rajpurkar et al., 2016).There are two existing strategies for apply- ing pre-trained language representations to down- stream tasks: feature-based and\ufb01ne-tuning . The feature-based approach, such as ELMo (Peters et al., 2018a), uses task-speci\ufb01c architectures that include the pre-trained representations as addi- tional features. The \ufb01ne-tuning approach, such as the Generative Pre-trained Transformer (OpenAI GPT) (Radford et al., 2018), introduces minimal task-speci\ufb01c parameters, and is trained on the downstream tasks by simply \ufb01ne-tuning allpre- trained parameters. The two approaches share the same objective function during pre-training, where they use unidirectional language models to learn general language representations. We argue that current techniques restrict the power of the pre-trained representations, espe- cially for the \ufb01ne-tuning approaches. The ma- jor limitation is that standard language models are unidirectional, and this limits the choice of archi- tectures that can be used during pre-training. For example, in OpenAI GPT, the authors use a left-to- right architecture, where every token can only at- tend to previous tokens in the self-attention layers of the Transformer (Vaswani et al., 2017). Such re- strictions are sub-optimal for sentence-level tasks, and could be very harmful when applying \ufb01ne- tuning based approaches to token-level tasks such as question answering, where it is crucial to incor- porate context from both directions. In this paper, we improve the \ufb01ne-tuning based approaches by proposing BERT: Bidirectional Encoder Representations from Transformers.","the self-attention layers of the Transformer (Vaswani et al., 2017). 
Such re- strictions are sub-optimal for sentence-level tasks, and could be very harmful when applying \ufb01ne- tuning based approaches to token-level tasks such as question answering, where it is crucial to incor- porate context from both directions. In this paper, we improve the \ufb01ne-tuning based approaches by proposing BERT: Bidirectional Encoder Representations from Transformers. BERT alleviates the previously mentioned unidi- rectionality constraint by using a \u201cmasked lan- guage model\u201d (MLM) pre-training objective, in- spired by the Cloze task (Taylor, 1953). The masked language model randomly masks some of the tokens from the input, and the objective is to predict the original vocabulary id of the maskedarXiv:1810.04805v2 [cs.CL] 24 May 2019word based only on its context. Unlike left-to- right language model pre-training, the MLM ob- jective enables the representation to fuse the left and the right context, which allows us to pre- train a deep bidirectional Transformer. In addi- tion to the masked language model, we also use a \u201cnext sentence prediction\u201d task that jointly pre- trains text-pair representations. The contributions of our paper are as follows: \u2022 We demonstrate the importance of bidirectional pre-training for language representations. Un- like Radford et al. (2018), which uses unidirec- tional language models for pre-training, BERT uses masked language models to enable pre- trained deep bidirectional representations. This is also in contrast to Peters et al. (2018a), which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs. \u2022 We show that pre-trained representations reduce the need for many heavily-engineered task- speci\ufb01c architectures. BERT is the \ufb01rst \ufb01ne- tuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level andtoken-level tasks, outper- forming many task-speci\ufb01c architectures. \u2022 BERT advances the state of the art for eleven NLP tasks. The code and pre-trained mod- els are available at https:\/\/github.com\/ google-research\/bert . 2 Related Work There is a long history of pre-training general lan- guage representations, and we brie\ufb02y review the most widely-used approaches in this section. 2.1 Unsupervised Feature-based Approaches Learning widely applicable representations of words has been an active area of research for decades, including non-neural (Brown et al., 1992; Ando and Zhang, 2005; Blitzer et al., 2006) and neural (Mikolov et al., 2013; Pennington et al., 2014) methods. Pre-trained word embeddings are an integral part of modern NLP systems, of- fering signi\ufb01cant improvements over embeddings learned from scratch (Turian et al., 2010). To pre- train word embedding vectors, left-to-right lan- guage modeling objectives have been used (Mnih and Hinton, 2009), as well as objectives to dis- criminate correct from incorrect words in left and right context (Mikolov et al., 2013).These approaches have been generalized to coarser granularities, such as sentence embed- dings (Kiros et al., 2015; Logeswaran and Lee, 2018) or paragraph embeddings (Le and Mikolov, 2014). 
To train sentence representations, prior work has used objectives to rank candidate next sentences (Jernite et al., 2017; Logeswaran and Lee, 2018), left-to-right generation of next sen- tence words given a representation of the previous sentence (Kiros et al., 2015), or denoising auto- encoder","the token-level classi\ufb01er over the NER label set. To ablate the \ufb01ne-tuning approach, we apply the feature-based approach by extracting the activa- tions from one or more layers without \ufb01ne-tuning any parameters of BERT. These contextual em- beddings are used as input to a randomly initial- ized two-layer 768-dimensional BiLSTM before the classi\ufb01cation layer. Results are presented in Table 7. BERT LARGE performs competitively with state-of-the-art meth- ods. The best performing method concatenates the token representations from the top four hidden lay- ers of the pre-trained Transformer, which is only 0.3 F1 behind \ufb01ne-tuning the entire model. This demonstrates that BERT is effective for both \ufb01ne- tuning and feature-based approaches. 6 Conclusion Recent empirical improvements due to transfer learning with language models have demonstrated that rich, unsupervised pre-training is an integral part of many language understanding systems. In particular, these results enable even low-resource tasks to bene\ufb01t from deep unidirectional architec- tures. Our major contribution is further general- izing these \ufb01ndings to deep bidirectional architec- tures, allowing the same pre-trained model to suc- cessfully tackle a broad set of NLP tasks.References Alan Akbik, Duncan Blythe, and Roland V ollgraf. 2018. Contextual string embeddings for sequence labeling. In Proceedings of the 27th International Conference on Computational Linguistics , pages 1638\u20131649. Rami Al-Rfou, Dokook Choe, Noah Constant, Mandy Guo, and Llion Jones. 2018. Character-level lan- guage modeling with deeper self-attention. arXiv preprint arXiv:1808.04444 . Rie Kubota Ando and Tong Zhang. 2005. A framework for learning predictive structures from multiple tasks and unlabeled data. Journal of Machine Learning Research , 6(Nov):1817\u20131853. Luisa Bentivogli, Bernardo Magnini, Ido Dagan, Hoa Trang Dang, and Danilo Giampiccolo. 2009. The \ufb01fth PASCAL recognizing textual entailment challenge. In TAC. NIST. John Blitzer, Ryan McDonald, and Fernando Pereira. 2006. Domain adaptation with structural correspon- dence learning. In Proceedings of the 2006 confer- ence on empirical methods in natural language pro- cessing , pages 120\u2013128. Association for Computa- tional Linguistics. Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large anno- tated corpus for learning natural language inference. InEMNLP . Association for Computational Linguis- tics. Peter F Brown, Peter V Desouza, Robert L Mercer, Vincent J Della Pietra, and Jenifer C Lai. 1992. Class-based n-gram models of natural language. Computational linguistics , 18(4):467\u2013479. Daniel Cer, Mona Diab, Eneko Agirre, Inigo Lopez- Gazpio, and Lucia Specia. 2017. Semeval-2017 task 1: Semantic textual similarity multilingual and crosslingual focused evaluation. In Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017) , pages 1\u201314, Vancou- ver, Canada. Association for Computational Lin- guistics. Ciprian Chelba, Tomas Mikolov, Mike Schuster, Qi Ge, Thorsten Brants, Phillipp Koehn, and Tony Robin- son. 2013. 
One billion word benchmark for measur- ing progress in statistical language modeling. arXiv preprint arXiv:1312.3005 . Z. Chen, H. Zhang, X. Zhang, and L. Zhao. 2018. Quora question pairs. Christopher Clark and Matt Gardner. 2018. Simple and effective multi-paragraph reading comprehen- sion. In ACL.Kevin Clark, Minh-Thang Luong, Christopher D Man- ning, and Quoc Le. 2018. Semi-supervised se- quence modeling with cross-view training. In Pro- ceedings of the 2018 Conference on Empirical Meth- ods"]} \ No newline at end of file diff --git a/examples/flows/evaluation/perceived-intelligence-eval/flow.dag.yaml b/examples/flows/evaluation/perceived-intelligence-eval/flow.dag.yaml new file mode 100644 index 00000000000..62e14452a8c --- /dev/null +++ b/examples/flows/evaluation/perceived-intelligence-eval/flow.dag.yaml @@ -0,0 +1,262 @@ +id: eval_perceived_intelligence +name: Perceived Intelligence Evaluation Flow +environment: + python_requirements_txt: requirements.txt +inputs: + question: + type: string + default: What is the name of the new language representation model introduced in + the document? + answer: + type: string + default: The document mentions multiple language representation models, so it is + unclear which one is being referred to as \"new\". Can you provide more + specific information or context? + context: + type: string + default: '["statistical language modeling. arXiv preprint arXiv:1312.3005 . Z. + Chen, H. Zhang, X. Zhang, and L. Zhao. 2018. Quora question pairs. + Christopher Clark and Matt Gardner. 2018. Simple and effective + multi-paragraph reading comprehen- sion. In ACL.Kevin Clark, Minh-Thang + Luong, Christopher D Man- ning, and Quoc Le. 2018. Semi-supervised se- + quence modeling with cross-view training. In Pro- ceedings of the 2018 + Conference on Empirical Meth- ods in Natural Language Processing , pages + 1914\u2013 1925. Ronan Collobert and Jason Weston. 2008. A uni\ufb01ed + architecture for natural language processing: Deep neural networks with + multitask learning. In Pro- ceedings of the 25th international conference + on Machine learning , pages 160\u2013167. ACM. Alexis Conneau, Douwe + Kiela, Holger Schwenk, Lo \u00a8\u0131c Barrault, and Antoine Bordes. + 2017. Supervised learning of universal sentence representations from + natural language inference data. In Proceedings of the 2017 Conference on + Empirical Methods in Nat- ural Language Processing , pages 670\u2013680, + Copen- hagen, Denmark. Association for Computational Linguistics. Andrew M + Dai and Quoc V Le. 2015. Semi-supervised sequence learning. In Advances in + neural informa- tion processing systems , pages 3079\u20133087. J. Deng, + W. Dong, R. Socher, L.-J. Li, K. Li, and L. Fei- Fei. 2009. ImageNet: A + Large-Scale Hierarchical Image Database. In CVPR09 . William B Dolan and + Chris Brockett. 2005. Automati- cally constructing a corpus of sentential + paraphrases. InProceedings of the Third International Workshop on + Paraphrasing (IWP2005) . William Fedus, Ian Goodfellow, and Andrew M Dai. + 2018. Maskgan: Better text generation via \ufb01lling in the.arXiv + preprint arXiv:1801.07736 . Dan Hendrycks and Kevin Gimpel. 2016. Bridging + nonlinearities and stochastic regularizers with gaus- sian error linear + units. CoRR , abs\/1606.08415. Felix Hill, Kyunghyun Cho, and Anna + Korhonen. 2016. Learning distributed representations of sentences from + unlabelled data. 
In Proceedings of the 2016 Conference of the North + American Chapter of the Association for Computational Linguistics: Human + Language Technologies . Association for Computa- tional Linguistics. + Jeremy Howard and Sebastian Ruder. 2018. Universal language model + \ufb01ne-tuning for text classi\ufb01cation. In ACL. Association for + Computational Linguistics. Minghao Hu, Yuxing Peng, Zhen Huang, Xipeng + Qiu, Furu Wei, and Ming Zhou. 2018. Reinforced mnemonic reader for machine + reading comprehen- sion. In IJCAI . Yacine Jernite, Samuel R. Bowman, and + David Son- tag. 2017. Discourse-based objectives for fast un- supervised + sentence representation learning. CoRR , abs\/1705.00557.Mandar Joshi, + Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large + scale distantly supervised challenge dataset for reading comprehen- sion. + In ACL. Ryan Kiros, Yukun Zhu, Ruslan R Salakhutdinov, Richard Zemel, + Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Skip-thought + vectors. In Advances in neural information processing systems , pages + 3294\u20133302. Quoc Le and Tomas Mikolov. 2014. Distributed rep- + resentations of sentences and documents. In Inter- national Conference on + Machine Learning , pages 1188\u20131196. Hector J Levesque, Ernest Davis, + and Leora Morgen- stern. 2011. The winograd schema challenge. In Aaai + spring symposium: Logical formalizations of commonsense reasoning , volume + 46, page 47. Lajanugen Logeswaran and Honglak Lee. 2018. An ef\ufb01cient + framework for learning sentence represen- tations. In International + Conference on Learning Representations . Bryan McCann, James Bradbury, + Caiming Xiong, and Richard Socher. 2017. Learned in translation: + Con-","tool for measuring readability. Journalism Bulletin , + 30(4):415\u2013433. Erik F Tjong Kim Sang and Fien De Meulder. 2003. + Introduction to the conll-2003 shared task: Language-independent named + entity recognition. In CoNLL . Joseph Turian, Lev Ratinov, and Yoshua + Bengio. 2010. Word representations: A simple and general method for + semi-supervised learning. In Proceedings of the 48th Annual Meeting of the + Association for Compu- tational Linguistics , ACL \u201910, pages + 384\u2013394. Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. + Attention is all you need. In Advances in Neural Information Pro- cessing + Systems , pages 6000\u20136010. Pascal Vincent, Hugo Larochelle, Yoshua + Bengio, and Pierre-Antoine Manzagol. 2008. Extracting and composing robust + features with denoising autoen- coders. In Proceedings of the 25th + international conference on Machine learning , pages 1096\u20131103. ACM. + Alex Wang, Amanpreet Singh, Julian Michael, Fe- lix Hill, Omer Levy, and + Samuel Bowman. 2018a. Glue: A multi-task benchmark and analysis + platformfor natural language understanding. In Proceedings of the 2018 + EMNLP Workshop BlackboxNLP: An- alyzing and Interpreting Neural Networks + for NLP , pages 353\u2013355. Wei Wang, Ming Yan, and Chen Wu. 2018b. + Multi- granularity hierarchical attention fusion networks for reading + comprehension and question answering. InProceedings of the 56th Annual + Meeting of the As- sociation for Computational Linguistics (Volume 1: Long + Papers) . Association for Computational Lin- guistics. Alex Warstadt, + Amanpreet Singh, and Samuel R Bow- man. 2018. Neural network acceptability + judg- ments. arXiv preprint arXiv:1805.12471 . 
Adina Williams, Nikita + Nangia, and Samuel R Bow- man. 2018. A broad-coverage challenge corpus for + sentence understanding through inference. In NAACL . Yonghui Wu, Mike + Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, + Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. + Google\u2019s neural ma- chine translation system: Bridging the gap + between human and machine translation. arXiv preprint arXiv:1609.08144 . + Jason Yosinski, Jeff Clune, Yoshua Bengio, and Hod Lipson. 2014. How + transferable are features in deep neural networks? In Advances in neural + information processing systems , pages 3320\u20133328. Adams Wei Yu, David + Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V + Le. 2018. QANet: Combining local convolution with global self-attention + for reading comprehen- sion. In ICLR . Rowan Zellers, Yonatan Bisk, Roy + Schwartz, and Yejin Choi. 2018. Swag: A large-scale adversarial dataset + for grounded commonsense inference. In Proceed- ings of the 2018 + Conference on Empirical Methods in Natural Language Processing (EMNLP) . + Yukun Zhu, Ryan Kiros, Rich Zemel, Ruslan Salakhut- dinov, Raquel Urtasun, + Antonio Torralba, and Sanja Fidler. 2015. Aligning books and movies: + Towards story-like visual explanations by watching movies and reading + books. In Proceedings of the IEEE international conference on computer + vision , pages 19\u201327. Appendix for \u201cBERT: Pre-training of Deep + Bidirectional Transformers for Language Understanding\u201d We organize + the appendix into three sections: \u2022 Additional implementation details + for BERT are presented in Appendix A;\u2022 Additional details for our + experiments are presented in Appendix B; and \u2022 Additional ablation + studies are presented in Appendix C. We present additional ablation + studies for BERT including: \u2013Effect of Number of Training Steps; and + \u2013Ablation for Different"]} {"question": "What is the main difference + between BERT and previous language representation models?", "variant_id": + "v1", "line_number": 2, answer":"BERT is designed to pre-train deep + bidirectional representations from unlabeled text by jointly conditioning + on both left and right context in all layers, allowing it to incorporate + context from both directions. This is unlike previous language + representation models that are unidirectional, which limits the choice of + architectures that can be used during pre-training and could be + sub-optimal for sentence-level tasks and token-level tasks such as + question answering.","context":["BERT: Pre-training of Deep Bidirectional + Transformers for Language Understanding Jacob Devlin Ming-Wei Chang Kenton + Lee Kristina Toutanova Google AI Language + fjacobdevlin,mingweichang,kentonl,kristout g@google.com Abstract We + introduce a new language representa- tion model called BERT , which stands + for Bidirectional Encoder Representations from Transformers. Unlike recent + language repre- sentation models (Peters et al., 2018a; Rad- ford et al., + 2018), BERT is designed to pre- train deep bidirectional representations + from unlabeled text by jointly conditioning on both left and right context + in all layers. As a re- sult, the pre-trained BERT model can be \ufb01ne- + tuned with just one additional output layer to create state-of-the-art + models for a wide range of tasks, such as question answering and language + inference, without substantial task- speci\ufb01c architecture + modi\ufb01cations. 
BERT is conceptually simple and empirically powerful. + It obtains new state-of-the-art re- sults on eleven natural language + processing tasks, including pushing the GLUE score to 80.5% (7.7% point + absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute + improvement), SQuAD v1.1 question answer- ing Test F1 to 93.2 (1.5 point + absolute im- provement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute + improvement). 1 Introduction Language model pre-training has been shown to + be effective for improving many natural language processing tasks (Dai and + Le, 2015; Peters et al., 2018a; Radford et al., 2018; Howard and Ruder, + 2018). These include sentence-level tasks such as natural language + inference (Bowman et al., 2015; Williams et al., 2018) and paraphrasing + (Dolan and Brockett, 2005), which aim to predict the re- lationships + between sentences by analyzing them holistically, as well as token-level + tasks such as named entity recognition and question answering, where + models are required to produce \ufb01ne-grained output at the token level + (Tjong Kim Sang and De Meulder, 2003; Rajpurkar et al., 2016).There are + two existing strategies for apply- ing pre-trained language + representations to down- stream tasks: feature-based and\ufb01ne-tuning . + The feature-based approach, such as ELMo (Peters et al., 2018a), uses + task-speci\ufb01c architectures that include the pre-trained + representations as addi- tional features. The \ufb01ne-tuning approach, + such as the Generative Pre-trained Transformer (OpenAI GPT) (Radford et + al., 2018), introduces minimal task-speci\ufb01c parameters, and is + trained on the downstream tasks by simply \ufb01ne-tuning allpre- trained + parameters. The two approaches share the same objective function during + pre-training, where they use unidirectional language models to learn + general language representations. We argue that current techniques + restrict the power of the pre-trained representations, espe- cially for + the \ufb01ne-tuning approaches. The ma- jor limitation is that standard + language models are unidirectional, and this limits the choice of archi- + tectures that can be used during pre-training. For example, in OpenAI GPT, + the authors use a left-to- right architecture, where every token can only + at- tend to previous tokens in the self-attention layers of the + Transformer (Vaswani et al., 2017). Such re- strictions are sub-optimal + for sentence-level tasks, and could be very harmful when applying + \ufb01ne- tuning based approaches to token-level tasks such as question + answering, where it is crucial to incor- porate context from both + directions. In this paper, we improve the \ufb01ne-tuning based approaches + by proposing BERT: Bidirectional Encoder Representations from + Transformers.","the self-attention layers of the Transformer (Vaswani et + al., 2017). Such re- strictions are sub-optimal for sentence-level tasks, + and could be very harmful when applying \ufb01ne- tuning based approaches + to token-level tasks such as question answering, where it is crucial to + incor- porate context from both directions. In this paper, we improve the + \ufb01ne-tuning based approaches by proposing BERT: Bidirectional Encoder + Representations from Transformers. BERT alleviates the previously + mentioned unidi- rectionality constraint by using a \u201cmasked lan- + guage model\u201d (MLM) pre-training objective, in- spired by the Cloze + task (Taylor, 1953). 
The masked language model randomly masks some of the + tokens from the input, and the objective is to predict the original + vocabulary id of the maskedarXiv:1810.04805v2 [cs.CL] 24 May 2019word + based only on its context. Unlike left-to- right language model + pre-training, the MLM ob- jective enables the representation to fuse the + left and the right context, which allows us to pre- train a deep + bidirectional Transformer. In addi- tion to the masked language model, we + also use a \u201cnext sentence prediction\u201d task that jointly pre- + trains text-pair representations. The contributions of our paper are as + follows: \u2022 We demonstrate the importance of bidirectional + pre-training for language representations. Un- like Radford et al. (2018), + which uses unidirec- tional language models for pre-training, BERT uses + masked language models to enable pre- trained deep bidirectional + representations. This is also in contrast to Peters et al. (2018a), which + uses a shallow concatenation of independently trained left-to-right and + right-to-left LMs. \u2022 We show that pre-trained representations reduce + the need for many heavily-engineered task- speci\ufb01c architectures. + BERT is the \ufb01rst \ufb01ne- tuning based representation model that + achieves state-of-the-art performance on a large suite of sentence-level + andtoken-level tasks, outper- forming many task-speci\ufb01c + architectures. \u2022 BERT advances the state of the art for eleven NLP + tasks. The code and pre-trained mod- els are available at + https:\/\/github.com\/ google-research\/bert . 2 Related Work There is a + long history of pre-training general lan- guage representations, and we + brie\ufb02y review the most widely-used approaches in this section. 2.1 + Unsupervised Feature-based Approaches Learning widely applicable + representations of words has been an active area of research for decades, + including non-neural (Brown et al., 1992; Ando and Zhang, 2005; Blitzer et + al., 2006) and neural (Mikolov et al., 2013; Pennington et al., 2014) + methods. Pre-trained word embeddings are an integral part of modern NLP + systems, of- fering signi\ufb01cant improvements over embeddings learned + from scratch (Turian et al., 2010). To pre- train word embedding vectors, + left-to-right lan- guage modeling objectives have been used (Mnih and + Hinton, 2009), as well as objectives to dis- criminate correct from + incorrect words in left and right context (Mikolov et al., 2013).These + approaches have been generalized to coarser granularities, such as + sentence embed- dings (Kiros et al., 2015; Logeswaran and Lee, 2018). 
"]' +outputs: + perceived_intelligence_score: + type: string + reference: ${parse_score.output} +nodes: +- name: parse_score + type: python + source: + type: code + path: parse_score.py + inputs: + gpt_score: ${gpt_perceived_intelligence.output} +- name: aggregate + type: python + source: + type: code + path: aggregate.py + inputs: + perceived_intelligence_score: ${parse_score.output} + aggregation: true +- name: gpt_perceived_intelligence + type: llm + source: + type: code + path: gpt_perceived_intelligence.md + inputs: + deployment_name: gpt-35-turbo + max_tokens: 5 + answer: ${inputs.answer} + question: ${inputs.question} + context: ${inputs.context} + temperature: 0 + provider: AzureOpenAI + connection: azure_open_ai_connection + api: chat + module: promptflow.tools.aoai diff --git a/examples/flows/evaluation/perceived-intelligence-eval/gpt_perceived_intelligence.md b/examples/flows/evaluation/perceived-intelligence-eval/gpt_perceived_intelligence.md new file mode 100644 index 00000000000..cb58847b687 --- /dev/null +++ b/examples/flows/evaluation/perceived-intelligence-eval/gpt_perceived_intelligence.md @@ -0,0 +1,23 @@ +user: +# Instructions + +* There are many chatbots that can answer users questions based on the context given from different sources like search results, or snippets from books/papers. They try to understand users's question and then get context by either performing search from search engines, databases or books/papers for relevant content. Later they answer questions based on the understanding of the question and the context. +* Perceived intelligence is the degree to which a bot can impress the user with its responses, by showing originality, insight, creativity, knowledge, and adaptability. Perceived intelligence can be influenced by various factors, such as the content, tone, style, and structure of the bot's responses, the relevance, coherence, and accuracy of the information the bot provides, the creativity, originality, and wit of the bot's expressions, the depth, breadth, and insight of the bot's knowledge, and the ability of the bot to adapt, learn, and use feedback. +* Your goal is to score the answer for given question and context from 1 to 10 based on perceived intelligence described above: + * Score 10 means the answer is excellent for percieved intelligence + * Score 1 means the answer is poor for percieved intelligence + * Score 5 means the answer is normal for percieved intelligence +* Just respond with the score, nothing else. 
+
+# Real work
+
+## Question
+{{question}}
+
+## Answer
+{{answer}}
+
+## Context
+{{context}}
+
+## Score
\ No newline at end of file
diff --git a/examples/flows/evaluation/perceived-intelligence-eval/parse_score.py b/examples/flows/evaluation/perceived-intelligence-eval/parse_score.py
new file mode 100644
index 00000000000..0375ad65729
--- /dev/null
+++ b/examples/flows/evaluation/perceived-intelligence-eval/parse_score.py
@@ -0,0 +1,21 @@
+from promptflow import tool
+import re
+
+
+@tool
+def parse_score(gpt_score: str):
+    # Guard against LLM responses that contain no number at all, which
+    # would otherwise crash with float(None).
+    score = extract_float(gpt_score)
+    if score is None:
+        raise ValueError(f"Cannot parse a score from LLM output: {gpt_score!r}")
+    return score
+
+
+def extract_float(s):
+    # Return the first integer or decimal number found in s, or None.
+    match = re.search(r"[-+]?\d*\.\d+|\d+", s)
+    if match:
+        return float(match.group())
+    else:
+        return None
diff --git a/examples/flows/evaluation/perceived-intelligence-eval/requirements.txt b/examples/flows/evaluation/perceived-intelligence-eval/requirements.txt
new file mode 100644
index 00000000000..16306751554
--- /dev/null
+++ b/examples/flows/evaluation/perceived-intelligence-eval/requirements.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://azuremlsdktestpypi.azureedge.net/promptflow/
+promptflow
+promptflow-tools
\ No newline at end of file
diff --git a/examples/flows/standard/named-entity-recognition/NER_LLM.jinja2 b/examples/flows/standard/named-entity-recognition/NER_LLM.jinja2
new file mode 100644
index 00000000000..a58a23aad51
--- /dev/null
+++ b/examples/flows/standard/named-entity-recognition/NER_LLM.jinja2
@@ -0,0 +1,10 @@
+system:
+Your task is to find entities of a certain type in the given text content.
+If there are multiple entities, please return all of them, comma separated, e.g. "entity1, entity2, entity3".
+You should only return the entity list, nothing else.
+If there is no such entity, please return "None".
+
+user:
+Entity type: {{entity_type}}
+Text content: {{text}}
+Entities:
\ No newline at end of file
diff --git a/examples/flows/standard/named-entity-recognition/README.md b/examples/flows/standard/named-entity-recognition/README.md
new file mode 100644
index 00000000000..afd2fcf71e8
--- /dev/null
+++ b/examples/flows/standard/named-entity-recognition/README.md
@@ -0,0 +1,53 @@
+# Named Entity Recognition
+A flow that performs the named entity recognition task.
+
+Tools used in this flow:
+- `python` tool
+- built-in `llm` tool
+
+Connections used in this flow:
+- `azure_open_ai` connection
+
+## Prerequisites
+
+Install the promptflow sdk and other dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Setup connection
+Prepare your Azure OpenAI resource by following this [instruction](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) and get your `api_key` if you don't have one.
+
+Note that this example uses the [chat api](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/chatgpt?pivots=programming-language-chat-completions), so please use a `gpt-35-turbo` or `gpt-4` model deployment.
+
+Create the connection if you haven't done so already. Ensure you have put your Azure OpenAI endpoint key in the [azure_openai.yml](azure_openai.yml) file.
+```bash
+# Override keys with --set to avoid yaml file changes
+pf connection create -f ../../../connections/azure_openai.yml --set api_key= api_base=
+```
+
+Ensure the `azure_open_ai_connection` connection has been created.
+```bash
+pf connection show -n azure_open_ai_connection
+```
+
+
+## Run the flow locally
+
+### Run with a single line of input
+
+```bash
+# test with default input value in flow.dag.yaml
+pf flow test --flow .
+# test with specific input
+pf flow test --flow . --inputs text='The phone number (321) 654-0987 is no longer in service' entity_type='phone number'
+```
+
+### Run with multiple lines of data
+
+- Create a run
+```bash
+pf run create --flow . --data ./data.jsonl --stream
+```
+
+
diff --git a/examples/flows/standard/named-entity-recognition/cleansing.py b/examples/flows/standard/named-entity-recognition/cleansing.py
new file mode 100644
index 00000000000..a017d7f1ab9
--- /dev/null
+++ b/examples/flows/standard/named-entity-recognition/cleansing.py
@@ -0,0 +1,11 @@
+from typing import List
+from promptflow import tool
+
+
+@tool
+def cleansing(entities_str: str) -> List[str]:
+    # Split on commas, strip leading/trailing spaces, tabs, dots and quotes, drop empties
+    parts = entities_str.split(",")
+    cleaned_parts = [part.strip(" \t.\"") for part in parts]
+    entities = [part for part in cleaned_parts if len(part) > 0]
+    return entities
diff --git a/examples/flows/standard/named-entity-recognition/cleansing_test.py b/examples/flows/standard/named-entity-recognition/cleansing_test.py
new file mode 100644
index 00000000000..4b69215ea8b
--- /dev/null
+++ b/examples/flows/standard/named-entity-recognition/cleansing_test.py
@@ -0,0 +1,13 @@
+import unittest
+
+from cleansing import cleansing
+
+
+class CleansingTest(unittest.TestCase):
+    def test_normal(self):
+        self.assertEqual(cleansing("a, b, c"), ["a", "b", "c"])
+        self.assertEqual(cleansing("a, b, (425)137-98-25, "), ["a", "b", "(425)137-98-25"])
+        self.assertEqual(cleansing("a, b, F. Scott Fitzgerald., d"), ["a", "b", "F. Scott Fitzgerald", "d"])
+        self.assertEqual(cleansing("a, b, c, None., "), ["a", "b", "c", "None"])
+        self.assertEqual(cleansing(",,"), [])
+        self.assertEqual(cleansing(""), [])
diff --git a/examples/flows/standard/named-entity-recognition/data.jsonl b/examples/flows/standard/named-entity-recognition/data.jsonl
new file mode 100644
index 00000000000..12e8b3d96ed
--- /dev/null
+++ b/examples/flows/standard/named-entity-recognition/data.jsonl
@@ -0,0 +1,15 @@
+{"text": "The software engineer is working on a new update for the application.", "entity_type": "job title", "results": "software engineer"}
+{"text": "The project manager and the data analyst are collaborating to interpret the project data.", "entity_type": "job title", "results": "project manager, data analyst"}
+{"text": "The marketing manager is coordinating with the graphic designer to create a new advertisement campaign.", "entity_type": "job title", "results": "marketing manager, graphic designer"}
+{"text": "The CEO and CFO are discussing the financial forecast for the next quarter.", "entity_type": "job title", "results": "CEO, CFO"}
+{"text": "The web developer and UX designer are working together to improve the website's user interface.", "entity_type": "job title", "results": "web developer, UX designer"}
+{"text": "John finally decided to change his phone number after receiving too many spam calls.", "results": "None", "entity_type": "phone number"}
+{"text": "If you have any questions about our products, please call our customer service at (123) 456-7890.", "results": "(123) 456-7890", "entity_type": "phone number"}
+{"text": "My new phone number is (098) 765-4321, please update your contact list.", "results": "(098) 765-4321", "entity_type": "phone number"}
+{"text": "The phone number (321) 654-0987 is no longer in service.", "results": "(321) 654-0987", "entity_type": "phone number"}
+{"text": "Please dial the following phone number: (555) 123-4567 to reach our technical support.", "results": "(555) 123-4567", "entity_type": "phone number"}
+{"text": "John Doe has been appointed as the new CEO of the company.", "entity_type":"people's full name", "results":"John Doe"}
+{"text": "The novel 'The Great Gatsby' was written by F. Scott Fitzgerald.", "entity_type":"people's full name", "results":"F. Scott Fitzgerald"}
+{"text": "Mary Jane Watson and Peter Parker are characters in the Spider-Man series.", "entity_type":"people's full name", "results":"Mary Jane Watson, Peter Parker"}
+{"text": "The famous physicists, Albert Einstein and Isaac Newton, made significant contributions to the field of physics.", "entity_type":"people's full name", "results":"Isaac Newton, Albert Einstein"}
+{"text": "The Eiffel Tower is an iconic landmark in Paris.", "entity_type":"people's full name", "results":"None"}
\ No newline at end of file
diff --git a/examples/flows/standard/named-entity-recognition/eval_test.py b/examples/flows/standard/named-entity-recognition/eval_test.py
new file mode 100644
index 00000000000..d4a4ee99050
--- /dev/null
+++ b/examples/flows/standard/named-entity-recognition/eval_test.py
@@ -0,0 +1,120 @@
+import unittest
+import traceback
+import os
+import promptflow.azure as azure
+from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
+from azure.ai.ml import MLClient
+import promptflow
+
+
+class BaseTest(unittest.TestCase):
+    def setUp(self) -> None:
+        root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")
+        self.flow_path = os.path.join(root, "named_entity_recognition")
+        self.data_path = os.path.join(self.flow_path, "test_data.jsonl")
+        self.eval_match_rate_flow_path = os.path.join(root, "eval_entity_match_rate")
+        self.all_runs_generated = []
+
+        return super().setUp()
+
+    def tearDown(self):
+        for run in self.all_runs_generated:
+            try:
+                self.pf.runs.archive(run.name)
+            except Exception as e:
+                print(e)
+                traceback.print_exc()
+
+        return super().tearDown()
+
+    def check_run_basics(self, run, name):
+        self.assertTrue(run is not None)
+        self.assertEqual(run.display_name, name)
+        self.assertEqual(run.tags["unittest"], "true")
+
+
+class TestEvalAzure(BaseTest):
+    def setUp(self) -> None:
+        try:
+            credential = DefaultAzureCredential()
+            # Check if the given credential can get a token successfully.
+            credential.get_token("https://management.azure.com/.default")
+        except Exception:
+            # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential does not work
+            credential = InteractiveBrowserCredential()
+
+        ml_client = MLClient.from_config(
+            credential=credential,
+        )
+
+        self.pf = azure.PFClient(ml_client)
+        return super().setUp()
+
+    def test_bulk_run_and_eval(self):
+        run = self.pf.run(
+            flow=self.flow_path,
+            data=self.data_path,
+            column_mapping={
+                "text": "${data.text}",
+                "entity_type": "${data.entity_type}"
+            },
+            connections={"NER_LLM": {"connection": "azure_open_ai_connection"}},
+            runtime="chat_with_pdf_runtime",
+            display_name="ner_bulk_run",
+            tags={"unittest": "true"},
+            stream=True)
+        self.all_runs_generated.append(run)
+        self.check_run_basics(run, "ner_bulk_run")
+
+        eval = self.pf.run(
+            flow=self.eval_match_rate_flow_path,
+            run=run,
+            data=self.data_path,
+            column_mapping={
+                "entities": "${run.outputs.entities}",
+                "ground_truth": "${data.results}"
+            },
+            runtime="chat_with_pdf_runtime",
+            display_name="eval_match_rate",
+            tags={"unittest": "true"},
+            stream=True)
+        self.all_runs_generated.append(eval)
+        self.check_run_basics(eval, "eval_match_rate")
+
+        return eval
+
+
+class TestEval(BaseTest):
+    def setUp(self) -> None:
+        self.pf = promptflow.PFClient()
+        return super().setUp()
+
+    def test_bulk_run_and_eval(self):
+        run = self.pf.run(
+            flow=self.flow_path,
+            data=self.data_path,
+            column_mapping={
+                "text": "${data.text}",
+                "entity_type": "${data.entity_type}"
+            },
+            display_name="ner_bulk_run",
+            tags={"unittest": "true"},
+            stream=True)
+        self.all_runs_generated.append(run)
+        self.check_run_basics(run, "ner_bulk_run")
+
+        eval = self.pf.run(
+            flow=self.eval_match_rate_flow_path,
+            run=run,
+            data=self.data_path,
+            column_mapping={
+                "entities": "${run.outputs.entities}",
+                "ground_truth": "${data.results}"
+            },
+            display_name="eval_match_rate",
+            tags={"unittest": "true"},
+            stream=True)
+        self.all_runs_generated.append(eval)
+        self.check_run_basics(eval, "eval_match_rate")
+
+        return eval
diff --git a/examples/flows/standard/named-entity-recognition/flow.dag.yaml b/examples/flows/standard/named-entity-recognition/flow.dag.yaml
new file mode 100644
index 00000000000..15b03707166
--- /dev/null
+++ b/examples/flows/standard/named-entity-recognition/flow.dag.yaml
@@ -0,0 +1,36 @@
+inputs:
+  entity_type:
+    type: string
+    default: job title
+  text:
+    type: string
+    default: Maxime is a data scientist at Auto Dataset, and his wife is a finance manager in the same company.
+outputs:
+  entities:
+    type: string
+    reference: ${cleansing.output}
+nodes:
+- name: NER_LLM
+  type: llm
+  source:
+    type: code
+    path: NER_LLM.jinja2
+  inputs:
+    deployment_name: gpt-35-turbo
+    max_tokens: 64
+    text: ${inputs.text}
+    entity_type: ${inputs.entity_type}
+  provider: AzureOpenAI
+  connection: azure_open_ai_connection
+  api: chat
+  module: promptflow.tools.aoai
+- name: cleansing
+  type: python
+  source:
+    type: code
+    path: cleansing.py
+  inputs:
+    entities_str: ${NER_LLM.output}
+environment:
+  python_requirements_txt: requirements.txt
\ No newline at end of file
diff --git a/examples/flows/standard/named-entity-recognition/requirements.txt b/examples/flows/standard/named-entity-recognition/requirements.txt
new file mode 100644
index 00000000000..16306751554
--- /dev/null
+++ b/examples/flows/standard/named-entity-recognition/requirements.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://azuremlsdktestpypi.azureedge.net/promptflow/
+promptflow
+promptflow-tools
\ No newline at end of file
diff --git a/examples/requirements.txt b/examples/requirements.txt
index 0cce304ad87..ed4ed784cc5 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -1,6 +1,6 @@
 # remove when we publish to pypi
 --extra-index-url https://azuremlsdktestpypi.azureedge.net/promptflow/
-promptflow[azure]==0.0.102010695
+promptflow[azure]==0.0.102309906
 promptflow-tools==0.1.0.b3
 python-dotenv
 langchain
diff --git a/examples/tutorials/e2e-development/chat-with-pdf.md b/examples/tutorials/e2e-development/chat-with-pdf.md
new file mode 100644
index 00000000000..b5f6d52160c
--- /dev/null
+++ b/examples/tutorials/e2e-development/chat-with-pdf.md
@@ -0,0 +1,258 @@
+# Tutorial: Chat with PDF
+
+In this tutorial, we will create a simple flow that retrieves relevant information from a PDF and uses it to generate an answer with GPT.
+
+## Overview
+Retrieval Augmented Generation (or RAG) has become a common pattern for building intelligent applications with Large Language Models (or LLMs), since it can infuse external knowledge into the model, which was not trained on up-to-date or proprietary information. The screenshot below shows how the new Bing in the Edge sidebar can answer questions based on the page content on the left - in this case a PDF file.
+![edge-chat-pdf](../../flows/chat/chat-with-pdf/assets/edge-chat-pdf.png)
+Note that the new Bing will also search the web for more information to generate the answer; let's ignore that part for now.
+
+In this tutorial we will mimic that functionality: retrieving relevant information from the PDF to generate an answer with GPT.
+
+**We will show you how to**:
+1. Create a console chatbot "chat_with_pdf" which accepts a URL to a PDF file as an argument, then answers questions based on the content of the PDF.
+2. Create a prompt flow for the chatbot, mostly reusing the code from step #1 above.
+3. Create a dataset with multiple questions to quickly test the flow.
+4. Evaluate the quality of the answers generated by the chat_with_pdf flow.
+5. Integrate the tests and evaluations into your development cycle - unit tests and CI/CD.
+6. Deploy the flow to Azure App Service and Streamlit to serve real user traffic.
+
+## Prerequisites
+To go through this tutorial:
+1. Install dependencies
+```bash
+  cd ../../flows/chat/chat-with-pdf/
+  pip install -r requirements.txt
+```
+
+2. Install the VS Code extension (optional but highly recommended)
+   // TODO
+
+## Console chatbot chat_with_pdf
+A typical RAG application has two steps:
+- **Retrieval**: Retrieve context information from external systems (database, search engine, files, etc.)
+- **Generation**: Construct the prompt with the retrieved context and get a response from the LLM.
+
+Since the retrieval step is more of a search problem, it can get really complicated. A popular, simple but effective approach is vector search, which requires an index building process. Say you have one or more documents containing the context information; the index building process looks like this:
+1. **Chunk**: Turn the documents into multiple chunks of text.
+2. **Embedding**: For each text chunk, call an embedding model to turn it into an array of floats (we call it an embedding or a vector).
+3. **Indexing**: Put the vectors into an index or a database which supports vector search - returning the top K relevant/similar vectors from the index or DB.
+
+Once the index is built, the **Retrieval** step is just a matter of turning the question into an embedding/vector and performing a vector search on the index to get the most relevant context for the question.
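+To make the three-step process concrete, here is a minimal sketch of the chunk/embed/index pipeline and the retrieval query. It is illustrative only, not the tutorial's actual code; it assumes the `faiss-cpu` package and the pre-1.0 `openai` package, configured for Azure OpenAI through the same environment variables introduced in the Configurations section below:
+
+```python
+import os
+
+import faiss
+import numpy as np
+import openai
+
+# Assumes the Azure OpenAI settings from the .env described later in this tutorial.
+openai.api_type = "azure"
+openai.api_base = os.environ["OPENAI_API_BASE"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+openai.api_version = os.environ.get("OPENAI_API_VERSION", "2023-05-15")
+
+
+def embed(texts):
+    # Call the embedding model once for a batch of text chunks.
+    resp = openai.Embedding.create(
+        input=texts, engine=os.environ["EMBEDDING_MODEL_DEPLOYMENT_NAME"]
+    )
+    return np.array([d["embedding"] for d in resp["data"]], dtype="float32")
+
+
+def build_index(document: str, chunk_size: int = 1024):
+    # 1. Chunk: naive fixed-size splitting, just for illustration.
+    chunks = [document[i:i + chunk_size] for i in range(0, len(document), chunk_size)]
+    # 2. Embedding: turn each chunk into a vector.
+    vectors = embed(chunks)
+    # 3. Indexing: store the vectors in a FAISS index, and keep a dict that maps
+    # each vector's position back to its original text snippet.
+    index = faiss.IndexFlatL2(vectors.shape[1])
+    index.add(vectors)
+    id_to_text = dict(enumerate(chunks))
+    return index, id_to_text
+
+
+def retrieve(question: str, index, id_to_text, top_k: int = 3):
+    # Retrieval: embed the question and return the top K most similar snippets.
+    _, ids = index.search(embed([question]), top_k)
+    return [id_to_text[i] for i in ids[0]]
+```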
+OK, now back to the chatbot we want to build; a simplified design can be:
+
+chat with pdf simple design
+
+A more robust, real-world application should consider using an external vector DB to store the vectors; for this simple example we choose a [FAISS](https://github.com/facebookresearch/faiss) index, which can be stored as a file. To avoid downloading and building the index for the same PDF file again and again, we add a check: if the PDF file already exists we won't download it, and likewise for index building.
+
+This design serves question answering pretty well, until you come to multi-turn conversations with the chatbot. Imagine a scenario like this:
+
+> $User: what is BERT?
+>
+> $Bot: BERT stands for Bidirectional Encoder Representations from Transformers.
+>
+> $User: is it better than GPT?
+>
+> $Bot: ...
+
+You would expect the chatbot to be smart enough to figure out that the **it** in your second question stands for BERT, and that your question is really "is BERT better than GPT". However, if you throw the question "is it better than GPT" at the embedding model and then the vector index/DB, they won't know that **it** stands for BERT, so you won't get the most relevant context back from the index. To fix this issue, we get help from the LLM (GPT) to "rewrite" the question based on the previous questions. The new design looks like this:
+
+chat with pdf better design
+
+A "rewrite_question" step is performed before feeding the question to the "find_context" step.
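+Conceptually, the rewrite step can be sketched like this (again illustrative, not the actual rewrite_question.py, which drives the model with a prompt file; the same Azure OpenAI configuration as in the previous sketch is assumed):
+
+```python
+import os
+
+import openai
+
+
+def rewrite_question(question: str, history: list) -> str:
+    # history is a list of (user_question, bot_answer) pairs from the conversation.
+    conversation = "\n".join(f"Q: {q}\nA: {a}" for q, a in history)
+    resp = openai.ChatCompletion.create(
+        engine=os.environ["CHAT_MODEL_DEPLOYMENT_NAME"],
+        messages=[
+            {
+                "role": "system",
+                "content": "Rewrite the user's question into a standalone question, "
+                "resolving pronouns and references using the conversation so far. "
+                "Reply with the rewritten question only.",
+            },
+            {"role": "user", "content": f"{conversation}\nQuestion: {question}"},
+        ],
+        temperature=0,
+    )
+    return resp["choices"][0]["message"]["content"]
+```
+
+With the example conversation above, the rewritten version of "is it better than GPT?" should come back as something like "Is BERT better than GPT?", which the embedding model and vector index can actually match against the BERT paper.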
+### Take a look at the chatbot in action!
+You should be able to run the console app by:
+```shell
+python chat_with_pdf/main.py https://arxiv.org/pdf/1810.04805.pdf
+```
+> Note: https://arxiv.org/pdf/1810.04805.pdf is the paper about one of the most famous earlier LLMs: BERT.
+
+If everything goes fine, it looks like this:
+![chatbot console](../../flows/chat/chat-with-pdf/assets/chatbot_console.gif)
+
+Now let's look at the actual code implementing the chatbot.
+
+### Configurations
+While this is a minimal LLM app, there are a few things we will later want to configure or even experiment with, so let's put them into environment variables. In a later section we will show you how to experiment with these configurations to get better quality from the chat app.
+
+Create a .env file in this folder with the content below; load_dotenv() can later be used to populate these into environment variables. We will explain what these are when talking about how each step is implemented.
+
+Check out the [example env file](.env.example).
+```ini
+OPENAI_API_BASE=
+OPENAI_API_KEY=
+OPENAI_API_VERSION=2023-05-15
+EMBEDDING_MODEL_DEPLOYMENT_NAME=text-embedding-ada-002
+CHAT_MODEL_DEPLOYMENT_NAME=gpt-35-turbo
+PROMPT_TOKEN_LIMIT=3000
+MAX_COMPLETION_TOKENS=256
+CHUNK_SIZE=1024
+CHUNK_OVERLAP=64
+VERBOSE=False
+```
+Note: CHAT_MODEL_DEPLOYMENT_NAME should point to a chat model like gpt-3.5-turbo or gpt-4.
+
+```bash
+# create connection needed by flow
+if pf connection list | grep azure_open_ai_connection; then
+    echo "azure_open_ai_connection already exists"
+else
+    pf connection create --file azure_openai.yml --name azure_open_ai_connection --set api_key= api_base=
+fi
+```
+
+### Implementation of each step
+#### Download pdf: [download.py](../../flows/chat/chat-with-pdf/chat_with_pdf/download.py)
+The downloaded PDF file will be stored in a temp folder.
+
+#### Build index: [build_index.py](../../flows/chat/chat-with-pdf/chat_with_pdf/build_index.py)
+A few libraries are used in this step to build the index:
+1. PyPDF2 for extraction of text from the PDF file.
+2. The OpenAI Python library for generating embeddings.
+3. The FAISS library to construct the vector index and persist it to a file. Note there is another dict used to store the mapping from vector index positions to the actual text snippets, because when we later query for the most relevant context we need the text snippets instead of just the embeddings/vectors.
+
+#### Rewrite question: [rewrite_question.py](../../flows/chat/chat-with-pdf/chat_with_pdf/rewrite_question.py)
+This step uses ChatGPT/GPT-4 to rewrite the question to be a better fit for finding relevant context in the vector index. The prompt file [rewrite_question.md](../../flows/chat/chat-with-pdf/chat_with_pdf/rewrite_question_prompt.md) should give you a better idea of how it works.
+
+#### Find context: [find_context.py](../../flows/chat/chat-with-pdf/chat_with_pdf/find_context.py)
+Load the FAISS index and the dict built in the "build index" step, then turn the question into a vector (using the same embedding function as in the build index step). There is a small trick in this step to make sure the context will not exceed the token limit of the model's input prompt ([aoai model max request tokens](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models); OpenAI has a similar limit). The output of this step is the final prompt that the QnA step will send to the chat model.
+
+#### QnA: [qna.py](../../flows/chat/chat-with-pdf/chat_with_pdf/qna.py)
+Uses OpenAI's ChatGPT or GPT-4 model with the ChatCompletion API to get a good answer based on the conversation history and the context from the PDF.
+
+#### The main loop: [main.py](../../flows/chat/chat-with-pdf/chat_with_pdf/main.py)
+This is the main entry point of the chatbot, with a loop that reads questions from user input and calls the steps above to answer them.
+
+To keep this example simple, we store the downloaded file and the built index as local files. Although there is a mechanism to honor the cached file/index, loading the index still takes time and adds to the latency the user feels. Also, if the chatbot is hosted on a server, requests for the same PDF file must hit the same server node for the cache to be effective. In a real-world scenario, you will likely want to store the index in a centralized service/DB instead. There are many such DBs available, like [Azure Cognitive Search](https://learn.microsoft.com/en-us/azure/search/vector-search-overview), [Pinecone](https://www.pinecone.io/), [Qdrant](https://qdrant.tech/), ...
+
+## Prompt flow: when you start thinking about the quality of your LLM app
+Now the simple chatbot is working, which was straightforward. But this is just the first step, and the simpler part of the journey. Just like any machine-learning-based application, building a good LLM app usually requires a lot of tuning work, e.g. trying different prompts (rewrite question, QnA), different parameters (chunk size, overlap size, context limit, etc.) or even different workflow designs (in our example, running with or without the rewrite_question step is a configuration you may want to experiment with).
+
+We need proper tooling to allow experimenting and tuning with these LLM apps. Prompt flow is designed for this, and it enables you to test your LLM apps by:
+- Running a few examples and manually verifying how they work out.
+- Running a larger scale test dataset and using a more formal approach (with metrics) to evaluate the quality of your app.
+
+You might have already learned how to create a prompt flow from scratch. Building a prompt flow from existing code is also straightforward. You can construct a [chat flow]() either by composing the YAML file or by using the visual editor of the [Visual Studio Code extension](), and then create a few wrappers for existing code.
+
+Check out below:
+- [flow.dag.yaml](../../flows/chat/chat-with-pdf/flow.dag.yaml)
+- [setup_env.py](../../flows/chat/chat-with-pdf/setup_env.py)
+- [download_tool.py](../../flows/chat/chat-with-pdf/download_tool.py)
+- [build_index_tool.py](../../flows/chat/chat-with-pdf/build_index_tool.py)
+- [rewrite_question_tool.py](../../flows/chat/chat-with-pdf/rewrite_question_tool.py)
+- [find_context_tool.py](../../flows/chat/chat-with-pdf/find_context_tool.py)
+- [qna_tool.py](../../flows/chat/chat-with-pdf/qna_tool.py)
+
+E.g. the build_index_tool wrapper:
+```python
+from promptflow import tool
+from chat_with_pdf.build_index import create_faiss_index
+
+
+@tool
+def build_index_tool(pdf_path: str) -> str:
+    return create_faiss_index(pdf_path)
+```
+
+The setup_env node requires some explanation: you might recall that in the console chatbot we use environment variables to manage different configurations, including the OpenAI API key. In prompt flow we use a [Connection]() to manage access to external services like OpenAI, and we support passing a configuration object into the flow to make experimentation easier. The setup_env node writes the properties from the connection and the configuration object into environment variables, so that the core chatbot code can work as-is.
+
+The flow looks like:
+chat with pdf flow, multi-node
+
+## Prompt flow evaluations
+Now that the prompt flow for chat_with_pdf is created, you might have already run/debugged the flow through the [Visual Studio Code extension](). It's time to do some testing and evaluation, which starts with:
+1. Create a test dataset which contains a few question and pdf_url pairs.
+2. Use existing [evaluation flows]() or develop new evaluation flows to generate metrics.
+
+A small dataset can be found here: [bert-paper-qna.jsonl](../../flows/chat/chat-with-pdf/data/bert-paper-qna.jsonl), which contains around 10 questions for the BERT paper.
+
+You can do a batch run with the test dataset and manually review the output. This can be done through the Visual Studio Code extension, the CLI, or the Python SDK.
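+For the Python SDK route, a minimal sketch looks like the following (it mirrors the batch_run.yaml and eval_run.yaml files shown below, and the same `PFClient.run` pattern used by this repo's unit tests; the `config` overrides from batch_run.yaml are omitted for brevity):
+
+```python
+from promptflow import PFClient
+
+pf = PFClient()
+
+# Batch run of the chat_with_pdf flow over the test dataset.
+base_run = pf.run(
+    flow=".",
+    data="./data/bert-paper-qna.jsonl",
+    column_mapping={
+        "chat_history": "${data.chat_history}",
+        "pdf_url": "${data.pdf_url}",
+        "question": "${data.question}",
+    },
+    stream=True,
+)
+
+# Evaluation run that consumes the inputs/outputs of the previous run.
+eval_run = pf.run(
+    flow="../../evaluation/groundedness-eval",
+    run=base_run,
+    column_mapping={
+        "question": "${run.inputs.question}",
+        "answer": "${run.outputs.answer}",
+        "context": "${run.outputs.context}",
+    },
+    stream=True,
+)
+```
+
+The CLI equivalent of the same two runs is shown below.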
+
+**batch_run.yaml**
+```yaml
+name: chat_with_pdf_default_20230820_162219_559000
+flow: .
+data: ./data/bert-paper-qna.jsonl
+#run:
+column_mapping:
+  chat_history: ${data.chat_history}
+  pdf_url: ${data.pdf_url}
+  question: ${data.question}
+  config:
+    EMBEDDING_MODEL_DEPLOYMENT_NAME: text-embedding-ada-002
+    CHAT_MODEL_DEPLOYMENT_NAME: gpt-35-turbo
+    PROMPT_TOKEN_LIMIT: 3000
+    MAX_COMPLETION_TOKENS: 256
+    VERBOSE: true
+    CHUNK_SIZE: 1024
+    CHUNK_OVERLAP: 64
+```
+**CLI**
+```bash
+run_name="chat_with_pdf_"$(openssl rand -hex 12)
+pf run create --file batch_run.yaml --stream --name $run_name
+```
+
+The output will include something like below:
+```json
+{
+    "name": "chat_with_pdf_default_20230820_162219_559000",
+    "created_on": "2023-08-20T16:23:39.608101",
+    "status": "Completed",
+    "display_name": "chat_with_pdf_default_20230820_162219_559000",
+    "description": null,
+    "tags": null,
+    "properties": {
+        "flow_path": "/Users/jietong/Work/azure-promptflow/scratchpad/chat_with_pdf",
+        "output_path": "/Users/jietong/.promptflow/.runs/chat_with_pdf_default_20230820_162219_559000"
+    },
+    "flow_name": "chat_with_pdf",
+    "data": "/Users/jietong/Work/azure-promptflow/scratchpad/chat_with_pdf/data/bert-paper-qna.jsonl",
+    "output": "/Users/jietong/.promptflow/.runs/chat_with_pdf_default_20230820_162219_559000/flow_outputs/output.jsonl"
+}
+```
+
+We developed two evaluation flows, one for "[groundedness](../../evaluation/groundness-eval/)" and one for "[perceived intelligence](../../evaluation/perceived-intelligence-eval/)". Reading the prompts will give you a better idea of what these two metrics are:
+- [groundedness prompt](../eval_groundedness/gpt_groundedness.md)
+- [perceived intelligence prompt](../eval_perceived_intelligence/gpt_perceived_intelligence.md)
+
+Evaluation is also a batch run - a batch run of the evaluation flow with the previous run as input.
+
+**eval_run.yaml:**
+```yaml
+flow: ../../evaluation/groundedness-eval
+run: chat_with_pdf_default_20230820_162219_559000
+column_mapping:
+  question: ${run.inputs.question}
+  answer: ${run.outputs.answer}
+  context: ${run.outputs.context}
+```
+NOTE: the run property in eval_run.yaml is the run name from batch_run.yaml.
+
+**CLI:**
+```bash
+eval_run_name="eval_groundedness_"$(openssl rand -hex 12)
+pf run create --file eval_run.yaml --run $run_name --name $eval_run_name
+```
+
+After the run completes you can use the commands below to get details of the runs:
+```bash
+pf run show-details --name $eval_run_name
+pf run show-metrics --name $eval_run_name
+pf run visualize --name $eval_run_name
+```
+
+## Integrate prompt flow into your CI/CD workflow
+Now you've seen how to run tests/evaluations for prompt flow. It's also straightforward to integrate these into your CI/CD workflow using either the CLI or the SDK. In this example we have various unit tests that run the tests/evaluations for the chat_with_pdf flow.
+
+Check the [test](../../flows/chat/chat-with-pdf/tests/) folder.
+ +```bash +# run all the tests +python -m unittest discover -s tests -p '*_test.py' +``` + +## Deployment +//TODO: command line and screenshot of pf flow export and deploy to different cloud platforms \ No newline at end of file diff --git a/examples/tutorials/get-started/quickstart-azure.ipynb b/examples/tutorials/get-started/quickstart-azure.ipynb index 53b167655a2..231d4810820 100644 --- a/examples/tutorials/get-started/quickstart-azure.ipynb +++ b/examples/tutorials/get-started/quickstart-azure.ipynb @@ -29,124 +29,9 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://azuremlsdktestpypi.azureedge.net/promptflow/\n", - "Requirement already satisfied: promptflow[azure]==0.0.101903259 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from -r ../../requirements.txt (line 3)) (0.0.101903259)\n", - "Requirement already satisfied: promptflow-tools==0.1.0.b3 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from -r ../../requirements.txt (line 4)) (0.1.0b3)\n", - "Requirement already satisfied: python-dotenv in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from -r ../../requirements.txt (line 5)) (1.0.0)\n", - "Requirement already satisfied: langchain in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from -r ../../requirements.txt (line 6)) (0.0.149)\n", - "Requirement already satisfied: jinja2 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from -r ../../requirements.txt (line 7)) (3.0.1)\n", - "Requirement already satisfied: bs4 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from -r ../../requirements.txt (line 8)) (0.0.1)\n", - "Requirement already satisfied: openai<0.28.0,>=0.27.8 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from promptflow[azure]==0.0.101903259->-r ../../requirements.txt (line 3)) (0.27.8)\n", - "Requirement already satisfied: flask<3.0.0,>=2.2.3 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from promptflow[azure]==0.0.101903259->-r ../../requirements.txt (line 3)) (2.2.3)\n", - "Requirement already satisfied: dataset<2.0.0,>=1.6.0 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from promptflow[azure]==0.0.101903259->-r ../../requirements.txt (line 3)) (1.6.0)\n", - "Requirement already satisfied: sqlalchemy<2.0.0,>=1.4.48 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from promptflow[azure]==0.0.101903259->-r ../../requirements.txt (line 3)) (1.4.48)\n", - "Requirement already satisfied: pandas<2.0.0,>=1.5.3 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from promptflow[azure]==0.0.101903259->-r ../../requirements.txt (line 3)) (1.5.3)\n", - "Requirement already satisfied: keyring<25.0.0,>=24.2.0 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from promptflow[azure]==0.0.101903259->-r ../../requirements.txt (line 3)) (24.2.0)\n", - "Requirement already satisfied: pydash<6.0.0,>=5.1.2 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from promptflow[azure]==0.0.101903259->-r ../../requirements.txt (line 3)) (5.1.2)\n", - "Requirement already satisfied: cryptography<42.0.0,>=41.0.2 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from promptflow[azure]==0.0.101903259->-r ../../requirements.txt (line 3)) (41.0.3)\n", - "Requirement already satisfied: colorama<0.5.0,>=0.4.6 in d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages (from 
promptflow[azure]==0.0.101903259->-r ../../requirements.txt (line 3)) (0.4.6)\n",
-    [... dozens more deleted "Requirement already satisfied: ..." lines covering the transitive dependencies of promptflow[azure], promptflow-tools, langchain, jinja2, and bs4 ...]
-    "Note: you may need to restart the kernel to use updated packages.\n"
-   ]
-  },
-  {
-   "name": "stderr",
-   "output_type": "stream",
-   "text": [
-    "WARNING: Ignoring invalid distribution -romptflow-sdk (d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages)\n",
-    "WARNING: Ignoring invalid distribution -romptflow-sdk (d:\\dev\\anaconda\\envs\\pf-examples\\lib\\site-packages)\n"
-   ]
-  }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
  "source": [
   "%pip install -r ../../requirements.txt"
  ]
 },
@@ -164,7 +49,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -192,7 +77,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -216,17 +101,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
-  {
-   "name": "stderr",
-   "output_type": "stream",
-   "text": [
-    "Found the config file in: D:\\enlistment\\aml\\promptflow\\prompt-flow\\examples\\.azureml\\config.json\n"
-   ]
-  }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
  "source": [
   "# Get a handle to workspace\n",
   "ml_client = MLClient.from_config(credential=credential)\n",
@@ -269,7 +146,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -288,54 +165,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
-  {
-   "name": "stderr",
-   "output_type": "stream",
-   "text": [
-    "\u001b[39m\n",
-    "\n"
-   ]
-  },
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Portal url: https://ml.azure.com/prompts/flow/bulkrun/run/web_classification_default_20230818_174622_330767/details?wsid=/subscriptions/96aede12-2f73-41cb-b983-6d11a904839b/resourceGroups/promptflow/providers/Microsoft.MachineLearningServices/workspaces/promptflow-eastus&flight=promptfilestorage,PFSourceRun=false\n",
-    [... further deleted output lines echoing the run record: name, created_on, status, display_name, tags, properties (runtime name/version, definition file, snapshot id), creation_context, start/end time, data and portal URLs, outputs ...]
-    "\n"
-   ]
-  }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
  "source": [
   "# create run\n",
   "base_run = pf.run(\n",
@@ -348,45 +180,18 @@
 },
 {
  "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
-  {
-   "data": {
-    "text/plain": [
-     ""
-    ]
-   },
-   "execution_count": 7,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
  "source": [
   "pf.stream(base_run)"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
-  {
-   "ename": "InvalidRunStatusError",
-   "evalue": "Run 'web_classification_default_20230818_174622_330767' is not completed, the status is 'Running'. Please wait for its completion, or select other completed run(s).",
-   "output_type": "error",
-   "traceback": [
-    [... deleted traceback lines walking from the notebook cell through PFClient.get_details -> RunOperations.get_details -> RunOperations._check_cloud_run_completed -> Run._check_run_status_is_completed ...]
-    "\u001b[1;31mInvalidRunStatusError\u001b[0m: Run 'web_classification_default_20230818_174622_330767' is not completed, the status is 'Running'. Please wait for its completion, or select other completed run(s)."
-   ]
-  }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
  "source": [
   "details = pf.get_details(base_run)\n",
   "details.head(10)"
  ]
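Every hunk in this notebook makes the same mechanical change: stored cell outputs (pip logs, run-detail dumps, a stale traceback) are deleted and `execution_count` is reset to `null`, so the committed notebook stays clean. A minimal sketch of doing that cleanup programmatically with `nbformat` (the notebook path is hypothetical):

```python
import nbformat

# Strip outputs and execution counts, as the hunks above do by hand.
path = "flow-as-function.ipynb"  # hypothetical notebook path
nb = nbformat.read(path, as_version=4)
for cell in nb.cells:
    if cell.cell_type == "code":
        cell.outputs = []          # drop pip logs, stdout dumps, tracebacks
        cell.execution_count = None
nbformat.write(nb, path)
```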
diff --git a/scripts/readme/ghactions_driver/readme_step.py b/scripts/readme/ghactions_driver/readme_step.py
index 665f24f30f3..343c3da29f7 100644
--- a/scripts/readme/ghactions_driver/readme_step.py
+++ b/scripts/readme/ghactions_driver/readme_step.py
@@ -99,7 +99,7 @@ def get_workflow_step(self) -> str:
 
 class ExtractStepsAndRun(Step):
     def __init__(self) -> None:
-        Step.__init__(self, "Extract Steps")
+        Step.__init__(self, f"Extract Steps {ReadmeSteps.readme_name}")
 
     def get_workflow_step(self) -> str:
         template = Step.get_workflow_template("step_extract_steps_and_run.yml.jinja2")
@@ -107,14 +107,14 @@ def get_workflow_step(self) -> str:
             {
                 "step_name": self.workflow_name,
                 "working_dir": ReadmeSteps.working_dir,
-                "readme_name": (Path(ReadmeSteps.working_dir) / "README.md").as_posix(),
+                "readme_name": ReadmeSteps.readme_name,
             }
         )
 
 
 class ExtractStepsAndRunGPTFour(Step):
     def __init__(self) -> None:
-        Step.__init__(self, "Extract Steps")
+        Step.__init__(self, f"Extract Steps {ReadmeSteps.readme_name}")
 
     def get_workflow_step(self) -> str:
         template = Step.get_workflow_template(
@@ -124,7 +124,7 @@ def get_workflow_step(self) -> str:
             {
                 "step_name": self.workflow_name,
                 "working_dir": ReadmeSteps.working_dir,
-                "readme_name": (Path(ReadmeSteps.working_dir) / "README.md").as_posix(),
+                "readme_name": ReadmeSteps.readme_name,
             }
         )
@@ -176,6 +176,7 @@ class ReadmeSteps:
     """
 
     step_array = []  # Record steps
+    readme_name = ""  # Record readme name
     working_dir = ""  # the working directory of flow, relative to git_base_dir
     template = ""  # Select a base template under workflow_templates folder
     workflow = ""  # Target workflow name to be generated
@@ -231,7 +232,9 @@ def extract_steps_and_run_gpt_four() -> Step:
     # endregion steps
 
     @staticmethod
-    def setup_target(working_dir: str, template: str, target: str) -> str:
+    def setup_target(
+        working_dir: str, template: str, target: str, readme_name: str
+    ) -> str:
         """
         Used at the very head of jinja template to indicate basic information
         """
@@ -239,6 +242,7 @@
         ReadmeSteps.template = template
         ReadmeSteps.workflow = target
         ReadmeSteps.step_array = []
+        ReadmeSteps.readme_name = readme_name
         return ""
 
     @staticmethod
@@ -308,3 +312,4 @@ def write_workflow(
         output_telemetry.workflow_name = workflow_name
         output_telemetry.target_path = target_path
         output_telemetry.readme_folder = ReadmeSteps.working_dir
+        output_telemetry.readme_name = ReadmeSteps.readme_name
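Recording `readme_name` on `ReadmeSteps` lets the extract step be named after the exact markdown file it runs, so two workflows generated from different files in the same folder stay distinguishable. A hypothetical call under the new four-argument signature (all values here are illustrative, not taken from the diff):

```python
# Illustrative only: the fourth argument is the new readme_name parameter.
ReadmeSteps.setup_target(
    working_dir="examples/tutorials/e2e-development",
    template="basic_workflow_replace.yml.jinja2",
    target="samples_tutorials_e2e_development_chat_with_pdf.yml",
    readme_name="examples/tutorials/e2e-development/chat-with-pdf.md",
)
```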
diff --git a/scripts/readme/ghactions_driver/readme_templates/README.md.jinja2 b/scripts/readme/ghactions_driver/readme_templates/README.md.jinja2
index fbafbd1f7b8..8378e698ba4 100644
--- a/scripts/readme/ghactions_driver/readme_templates/README.md.jinja2
+++ b/scripts/readme/ghactions_driver/readme_templates/README.md.jinja2
@@ -21,6 +21,13 @@
 
 ## CLI examples
 
+### Tutorials ([tutorials](tutorials))
+
+| path | status | description |
+------|--------|-------------
+{% for tutorial in tutorials.readmes %}| [{{ tutorial.name }}]({{ tutorial.path }}) | [![{{tutorial.pipeline_name}}](https://github.com/microsoft/promptflow/actions/workflows/{{tutorial.yaml_name}}/badge.svg?branch={{branch}})](https://github.com/microsoft/promptflow/actions/workflows/{{tutorial.yaml_name}}) | {{ tutorial.description }} |
+{% endfor %}
+
 ### Flows ([flows](flows))
 
 #### [Standard flows](flows/standard/)
diff --git a/scripts/readme/ghactions_driver/readme_workflow_generate.py b/scripts/readme/ghactions_driver/readme_workflow_generate.py
index db57dd4dfa3..3b7c9075e34 100644
--- a/scripts/readme/ghactions_driver/readme_workflow_generate.py
+++ b/scripts/readme/ghactions_driver/readme_workflow_generate.py
@@ -8,17 +8,24 @@ def write_readme_workflow(readme_path, output_telemetry=Telemetry()):
     relative_path = Path(readme_path).relative_to(
         Path(ReadmeStepsManage.git_base_dir())
     )
-    workflow_path = relative_path.as_posix()
+    workflow_path = relative_path.parent.as_posix()
     relative_name_path = Path(readme_path).relative_to(
         Path(ReadmeStepsManage.git_base_dir()) / "examples"
     )
-    workflow_name = relative_name_path.as_posix().replace("/", "_").replace("-", "_")
+    workflow_name = (
+        relative_name_path.as_posix()
+        .replace(".md", "")
+        .replace("/README", "")
+        .replace("/", "_")
+        .replace("-", "_")
+    )
     workflow_name = "samples_" + workflow_name
     ReadmeSteps.setup_target(
         workflow_path,
         "basic_workflow_replace.yml.jinja2",
         f"{workflow_name}.yml",
+        relative_path.as_posix(),
     )
     ReadmeSteps.install_dependencies()
     ReadmeSteps.install_dev_dependencies()
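The new `.replace(".md", "").replace("/README", "")` chain in `write_readme_workflow` keeps workflow names unchanged for folder READMEs while giving standalone tutorial markdown files their own names. A quick sketch of the derivation on two representative inputs (the tutorial path is illustrative):

```python
def derive_workflow_name(relative_name_path: str) -> str:
    # Mirrors the replace chain added in write_readme_workflow above.
    name = (
        relative_name_path
        .replace(".md", "")
        .replace("/README", "")
        .replace("/", "_")
        .replace("-", "_")
    )
    return "samples_" + name

print(derive_workflow_name("connections/README.md"))
# -> samples_connections (unchanged for folder READMEs)
print(derive_workflow_name("tutorials/e2e-development/chat-with-pdf.md"))
# -> samples_tutorials_e2e_development_chat_with_pdf
```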
diff --git a/scripts/readme/ghactions_driver/workflow_steps/step_env_create_aoai.yml.jinja2 b/scripts/readme/ghactions_driver/workflow_steps/step_env_create_aoai.yml.jinja2
index 2e8d1279e36..38c7f236056 100644
--- a/scripts/readme/ghactions_driver/workflow_steps/step_env_create_aoai.yml.jinja2
+++ b/scripts/readme/ghactions_driver/workflow_steps/step_env_create_aoai.yml.jinja2
@@ -1,5 +1,8 @@
 - name: {{ step_name }}
   working-directory: {{ working_dir }}
   run: |
-    pf connection create --file .env --name {{ connection_name }}
-    pf connection list
\ No newline at end of file
+    if [[ -e .env ]]; then
+      pf connection create --file .env --name {{ connection_name }}
+      pf connection list
+    fi
+    
\ No newline at end of file
diff --git a/scripts/readme/ghactions_driver/workflow_steps/step_extract_steps_and_run.yml.jinja2 b/scripts/readme/ghactions_driver/workflow_steps/step_extract_steps_and_run.yml.jinja2
index 0ca900c66d7..c4140fd1704 100644
--- a/scripts/readme/ghactions_driver/workflow_steps/step_extract_steps_and_run.yml.jinja2
+++ b/scripts/readme/ghactions_driver/workflow_steps/step_extract_steps_and_run.yml.jinja2
@@ -1,7 +1,11 @@
 - name: {{ step_name }}
   working-directory: ${{ '{{' }} github.workspace }}
   run: |
-    python scripts/readme/extract_steps_from_readme.py -f {{ working_dir }}/README.md -o {{ working_dir }}
+    python scripts/readme/extract_steps_from_readme.py -f {{ readme_name }} -o {{ working_dir }}
+- name: Cat script
+  working-directory: {{ working_dir }}
+  run: |
+    cat bash_script.sh
 - name: Run scripts
   working-directory: {{ working_dir }}
   run: |
diff --git a/scripts/readme/ghactions_driver/workflow_steps/step_extract_steps_and_run_gpt4.yml.jinja2 b/scripts/readme/ghactions_driver/workflow_steps/step_extract_steps_and_run_gpt4.yml.jinja2
index 0ca900c66d7..c4140fd1704 100644
--- a/scripts/readme/ghactions_driver/workflow_steps/step_extract_steps_and_run_gpt4.yml.jinja2
+++ b/scripts/readme/ghactions_driver/workflow_steps/step_extract_steps_and_run_gpt4.yml.jinja2
@@ -1,7 +1,11 @@
 - name: {{ step_name }}
   working-directory: ${{ '{{' }} github.workspace }}
   run: |
-    python scripts/readme/extract_steps_from_readme.py -f {{ working_dir }}/README.md -o {{ working_dir }}
+    python scripts/readme/extract_steps_from_readme.py -f {{ readme_name }} -o {{ working_dir }}
+- name: Cat script
+  working-directory: {{ working_dir }}
+  run: |
+    cat bash_script.sh
 - name: Run scripts
   working-directory: {{ working_dir }}
   run: |
diff --git a/scripts/readme/ghactions_driver/workflow_steps/step_install_deps.yml.jinja2 b/scripts/readme/ghactions_driver/workflow_steps/step_install_deps.yml.jinja2
index 46b8b463cd4..538b01e2061 100644
--- a/scripts/readme/ghactions_driver/workflow_steps/step_install_deps.yml.jinja2
+++ b/scripts/readme/ghactions_driver/workflow_steps/step_install_deps.yml.jinja2
@@ -1,5 +1,7 @@
 - name: {{ step_name }}
   working-directory: {{ working_dir }}
   run: |
-    python -m pip install --upgrade pip
-    pip install -r requirements.txt
\ No newline at end of file
+    if [[ -e requirements.txt ]]; then
+      python -m pip install --upgrade pip
+      pip install -r requirements.txt
+    fi
\ No newline at end of file
diff --git a/scripts/readme/ghactions_driver/workflow_templates/pdf_workflow.yml.jinja2 b/scripts/readme/ghactions_driver/workflow_templates/pdf_workflow.yml.jinja2
index b3ea5e29603..7ae80420a2a 100644
--- a/scripts/readme/ghactions_driver/workflow_templates/pdf_workflow.yml.jinja2
+++ b/scripts/readme/ghactions_driver/workflow_templates/pdf_workflow.yml.jinja2
@@ -20,7 +20,7 @@ steps:
       python -m pip install --upgrade pip
       pip install -r ${{ '{{' }} github.workspace }}/examples/requirements.txt
       pip install -r ${{ '{{' }} github.workspace }}/examples/dev_requirements.txt
-  - name: Create Aoai Connection
+  - name: Create Chat With PDF Custom Connection
     working-directory: {{ gh_working_dir }}
     run: |
       AOAI_API_KEY=${{ '{{' }} secrets.AOAI_API_KEY_TEST }}
@@ -32,6 +32,14 @@
       mv .env.example .env
       pf connection create --file .env --name chat_with_pdf_custom_connection
     fi
+  - name: Create AOAI Connection
+    working-directory: {{ gh_working_dir }}
+    run: |
+      AOAI_API_KEY=${{ '{{' }} secrets.AOAI_API_KEY_TEST }}
+      AOAI_API_ENDPOINT=${{ '{{' }} secrets.AOAI_API_ENDPOINT_TEST }}
+      if [[ -e azure_openai.yml ]]; then
+        pf connection create --file azure_openai.yml --set api_key=$AOAI_API_KEY api_base=$AOAI_API_ENDPOINT
+      fi
   - name: Azure Login
     uses: azure/login@v1
     with:
diff --git a/scripts/readme/readme.py b/scripts/readme/readme.py
index 9fc1b6a58eb..b34c507661d 100644
--- a/scripts/readme/readme.py
+++ b/scripts/readme/readme.py
@@ -159,14 +159,17 @@ def write_readme(workflow_telemetrys, readme_telemetrys):
 
     # Adjust tutorial names:
     for readme_telemetry in readme_telemetrys:
-        notebook_name = readme_telemetry.readme_folder.split("/")[-1]
-        notebook_path = (
-            readme_telemetry.readme_folder.replace("examples/", "") + "/README.md"
-        )
+        if readme_telemetry.readme_name.endswith("README.md"):
+            notebook_name = readme_telemetry.readme_folder.split("/")[-1]
+        else:
+            notebook_name = readme_telemetry.readme_name.split("/")[-1].replace(
+                ".md", ""
+            )
+        notebook_path = readme_telemetry.readme_name.replace("examples/", "")
         pipeline_name = readme_telemetry.workflow_name
         yaml_name = f"{readme_telemetry.workflow_name}.yml"
         description = get_readme_description_first_sentence(
-            readme_telemetry.readme_folder + "/README.md"
+            readme_telemetry.readme_name
        )
         readme_folder = readme_telemetry.readme_folder
@@ -270,6 +273,7 @@
     input_glob_readme = [
         "examples/flows/**/README.md",
         "examples/connections/**/README.md",
+        "examples/tutorials/**/chat*.md",
     ]
     readme_telemetrys = []
     readme_generator.main(input_glob_readme, readme_telemetrys)
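The branch added to `write_readme` keeps the old folder-based naming for `README.md` files and derives the display name from the filename for standalone tutorial pages picked up by the new `chat*.md` glob. A small sketch with illustrative telemetry values:

```python
def tutorial_name_and_path(readme_name: str, readme_folder: str):
    # Mirrors the branch added to write_readme above; inputs are illustrative.
    if readme_name.endswith("README.md"):
        notebook_name = readme_folder.split("/")[-1]
    else:
        notebook_name = readme_name.split("/")[-1].replace(".md", "")
    notebook_path = readme_name.replace("examples/", "")
    return notebook_name, notebook_path

# A folder README keeps its folder-derived name:
print(tutorial_name_and_path("examples/connections/README.md", "examples/connections"))
# -> ('connections', 'connections/README.md')

# A standalone tutorial page is named after the file:
print(tutorial_name_and_path(
    "examples/tutorials/e2e-development/chat-with-pdf.md",
    "examples/tutorials/e2e-development",
))
# -> ('chat-with-pdf', 'tutorials/e2e-development/chat-with-pdf.md')
```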
"examples/flows/**/README.md", "examples/connections/**/README.md", + "examples/tutorials/**/chat*.md", ] readme_telemetrys = [] readme_generator.main(input_glob_readme, readme_telemetrys) diff --git a/scripts/readme/readme_generator.py b/scripts/readme/readme_generator.py index c0e45a2e8e0..9170761906a 100644 --- a/scripts/readme/readme_generator.py +++ b/scripts/readme/readme_generator.py @@ -41,7 +41,7 @@ def main(input_glob, output_files=[]): for readme in readme_items: readme_telemetry = Telemetry() - workflow_name = readme.parent.relative_to(ReadmeStepsManage.git_base_dir()) + workflow_name = readme.relative_to(ReadmeStepsManage.git_base_dir()) # Deal with readme write_readme_workflow(workflow_name.resolve(), readme_telemetry) ReadmeSteps.cleanup()