Lisal/math to code (microsoft#216)
# Description
- Add maths-to-code examples, including a standard flow and a corresponding evaluation flow
- Test data and a test notebook are provided in the maths-to-code flow folder
- Fix the run name assertion logic in the "chat with pdf" flow


# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes]**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
lisagreenview authored Aug 30, 2023
1 parent e8ddeef commit 23cb98f
Showing 16 changed files with 1,447 additions and 1 deletion.
89 changes: 89 additions & 0 deletions .github/workflows/samples_flows_standard_maths_to_code.yml
@@ -0,0 +1,89 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.

name: samples_flows_standard_maths_to_code
on:
  schedule:
    - cron: "39 19 * * *" # Every day starting at 3:39 BJT
  pull_request:
    branches: [ main ]
    paths: [ examples/flows/standard/maths-to-code/**, examples/*requirements.txt, .github/workflows/samples_flows_standard_maths_to_code.yml ]
  workflow_dispatch:

jobs:
  samples_readme_ci:
    runs-on: ubuntu-latest
    steps:
    - name: Checkout repository
      uses: actions/checkout@v3
    - name: Setup Python 3.9 environment
      uses: actions/setup-python@v4
      with:
        python-version: "3.9"
    - name: Generate config.json
      run: echo ${{ secrets.TEST_WORKSPACE_CONFIG_JSON }} > ${{ github.workspace }}/examples/config.json
    - name: Prepare requirements
      working-directory: examples
      run: |
        if [[ -e requirements.txt ]]; then
          python -m pip install --upgrade pip
          pip install -r requirements.txt
        fi
    - name: Prepare dev requirements
      working-directory: examples
      run: |
        python -m pip install --upgrade pip
        pip install -r dev_requirements.txt
    - name: Refine .env file
      working-directory: examples/flows/standard/maths-to-code
      run: |
        AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }}
        AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }}
        AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/})
        if [[ -e .env.example ]]; then
          echo "env replacement"
          sed -i -e "s/<your_AOAI_key>/$AOAI_API_KEY/g" -e "s/<your_AOAI_endpoint>/$AOAI_API_ENDPOINT/g" .env.example
          mv .env.example .env
        fi
    - name: Create run.yml
      working-directory: examples/flows/standard/maths-to-code
      run: |
        gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }}
        gpt_base=$(echo ${gpt_base//\//\\/})
        if [[ -e run.yml ]]; then
          sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml
        fi
    - name: Azure Login
      uses: azure/login@v1
      with:
        creds: ${{ secrets.AZURE_CREDENTIALS }}
    - name: Extract Steps examples/flows/standard/maths-to-code/README.md
      working-directory: ${{ github.workspace }}
      run: |
        python scripts/readme/extract_steps_from_readme.py -f examples/flows/standard/maths-to-code/README.md -o examples/flows/standard/maths-to-code
    - name: Cat script
      working-directory: examples/flows/standard/maths-to-code
      run: |
        cat bash_script.sh
    - name: Run scripts
      working-directory: examples/flows/standard/maths-to-code
      run: |
        export aoai_api_key=${{ secrets.AOAI_API_KEY_TEST }}
        export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }}
        export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }}
        export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }}
        export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME }}
        bash bash_script.sh
    - name: Pip List for Debug
      if: ${{ always() }}
      working-directory: examples/flows/standard/maths-to-code
      run: |
        pip list
    - name: Upload artifact
      if: ${{ always() }}
      uses: actions/upload-artifact@v3
      with:
        name: artifact
        path: examples/flows/standard/maths-to-code/bash_script.sh
2 changes: 1 addition & 1 deletion examples/flows/chat/chat-with-pdf/tests/base_test.py
@@ -110,7 +110,7 @@ def create_eval_run(
    def check_run_basics(self, run, display_name=None):
        self.assertTrue(run is not None)
        if display_name is not None:
            self.assertEqual(run.display_name, display_name)
            self.assertTrue(run.display_name.find(display_name) != -1)
        self.assertEqual(run.tags["unittest"], "true")

    def run_eval_with_config(
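The one-line change above relaxes an exact-match assertion so that generated suffixes on run display names no longer break the test. A self-contained sketch showing that the new check matches `unittest`'s idiomatic substring assertion (the run names here are hypothetical, for illustration only):

```python
import unittest


class DisplayNameCheck(unittest.TestCase):
    # Sketch: same semantics as the fixed assertion above.
    def test_display_name_contains_expected_prefix(self):
        expected = "chat_with_pdf_eval"
        actual = "chat_with_pdf_eval_variant_0"  # e.g. a suffix was appended
        self.assertTrue(actual.find(expected) != -1)  # the check in this PR
        self.assertIn(expected, actual)               # idiomatic equivalent


if __name__ == "__main__":
    unittest.main()
```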
37 changes: 37 additions & 0 deletions examples/flows/evaluation/eval-accuracy-maths-to-code/aggregate.py
@@ -0,0 +1,37 @@
from typing import List

from promptflow import tool, log_metric


@tool
def accuracy_aggregate(processed_results: List[int]):

    num_exception = 0
    num_correct = 0

    for result in processed_results:
        if result == -1:
            num_exception += 1
        elif result == 1:
            num_correct += 1

    num_total = len(processed_results)
    accuracy = round(1.0 * num_correct / num_total, 2)
    error_rate = round(1.0 * num_exception / num_total, 2)

    log_metric(key="accuracy", value=accuracy)
    log_metric(key="error_rate", value=error_rate)

    return {
        "num_total": num_total,
        "num_correct": num_correct,
        "num_exception": num_exception,
        "accuracy": accuracy,
        "error_rate": error_rate
    }


if __name__ == "__main__":
    numbers = [1, 1, 1, 1, 0, -1, -1]
    result = accuracy_aggregate(numbers)
    print("The aggregated result is", result)
29 changes: 29 additions & 0 deletions examples/flows/evaluation/eval-accuracy-maths-to-code/flow.dag.yaml
@@ -0,0 +1,29 @@
$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json
inputs:
  groundtruth:
    type: string
    default: "1"
  prediction:
    type: string
    default: "2"
outputs:
  score:
    type: string
    reference: ${line_process.output}
nodes:
- name: line_process
  type: python
  source:
    type: code
    path: line_process.py
  inputs:
    groundtruth: ${inputs.groundtruth}
    prediction: ${inputs.prediction}
- name: aggregate
  type: python
  source:
    type: code
    path: aggregate.py
  inputs:
    processed_results: ${line_process.output}
  aggregation: true
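`aggregation: true` marks `aggregate` as an aggregation node: in a batch run it receives the list of all per-line `line_process` outputs rather than one value per line. A hypothetical driver illustrating that contract (the promptflow runtime does this wiring itself; `line_process.py` follows below):

```python
# Hypothetical driver showing how the DAG composes in a batch run.
# The real runtime invokes the nodes; this only illustrates the contract.
from line_process import line_process
from aggregate import accuracy_aggregate

pairs = [("8", "8"), ("3", "3"), ("153.86", "153.871")]  # (groundtruth, prediction)
per_line = [line_process(gt, pred) for gt, pred in pairs]  # -> [1, 1, 0]
metrics = accuracy_aggregate(per_line)  # the aggregation node sees the whole list
print(metrics)
```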
40 changes: 40 additions & 0 deletions examples/flows/evaluation/eval-accuracy-maths-to-code/line_process.py
@@ -0,0 +1,40 @@
from promptflow import tool


@tool
def line_process(groundtruth: str, prediction: str) -> int:

    processed_result = 0

    if prediction == "JSONDecodeError" or prediction.startswith("Unknown Error:"):
        processed_result = -1
        return processed_result

    try:
        groundtruth = float(groundtruth)
        prediction = float(prediction)
    except ValueError:
        processed_result = -1
        return processed_result

    if round(prediction, 2) == round(groundtruth, 2):
        processed_result = 1

    return processed_result


if __name__ == "__main__":
    processed_result = line_process("1.0", "1")
    print("The processed result is", processed_result)

    processed_result = line_process("3.14", "3.1415926")
    print("The processed result is", processed_result)

    processed_result = line_process("2.1", "2.0")
    print("The processed result is", processed_result)

    processed_result = line_process("1.0", "JSONDecodeError")
    print("The processed result is", processed_result)

    processed_result = line_process("1.0", "No module named 'numpy'")
    print("The processed result is", processed_result)
20 changes: 20 additions & 0 deletions examples/flows/evaluation/eval-accuracy-maths-to-code/math_data.jsonl
@@ -0,0 +1,20 @@
{"question": "What is the sum of 5 and 3?", "groundtruth": "8", "answer": "8"}
{"question": "Subtract 7 from 10.", "groundtruth": "3", "answer": "3"}
{"question": "Multiply 6 by 4.", "groundtruth": "24", "answer": "24"}
{"question": "Divide 20 by 5.", "groundtruth": "4", "answer": "4"}
{"question": "What is the square of 7?", "groundtruth": "49", "answer": "49"}
{"question": "What is the square root of 81?", "groundtruth": "9", "answer": "9"}
{"question": "If a rectangle has a length of 10 and width of 5, what is the area?", "groundtruth": "50", "answer": "50"}
{"question": "A circle has a radius of 7, what is the area? (Use 3.14 for pi)", "groundtruth": "153.86", "answer": "153.871"}
{"question": "Solve for x in the equation 2x + 3 = 9.", "groundtruth": "3", "answer": "3"}
{"question": "What is the value of x if 5x = 25?", "groundtruth": "5", "answer": "5"}
{"question": "A car travels 200 miles in 4 hours. What is the average speed of the car?", "groundtruth": "50", "answer": "50"}
{"question": "A car travels at a speed of 60 mph. How long will it take to travel 180 miles?", "groundtruth": "3", "answer": "3"}
{"question": "If a car travels at a speed of 40 mph for 2 hours, how far will it travel?","groundtruth": "80", "answer": "80"}
{"question":"A rectangle has length = 10 cm and width = 5 cm. What is its area?", "groundtruth":"50", "answer": "50"}
{"question":"A circle has radius = 7 cm. What is its circumference? (Use pi =3.14)", "groundtruth":"43.96", "answer": "43.959"}
{"question":"A triangle has base =10 cm and height =5 cm. What is its area?", "groundtruth":"25", "answer": "25"}
{"question":"What is the slope of the line that passes through (2,3) and (4,7)?", "groundtruth":"2", "answer": "2"}
{"question":"The distance between A and B is 2000km, A is moving towards B with speed 80km/hour, meanwhile B is moving towards A with speed 120km/hour, how many hours later A and B can meet?", "groundtruth":"10", "answer": "10"}
{"question":"The lengths of the two perpendicular sides of a right triangle are 6cm and 8cm. What is the length of the hypotenuse?", "groundtruth": "10", "answer": "10"}
{"question":"A is running with average speed 10km/hour, A already run half hour. B start to chase A along the same route with average speed 15km/hour, how many hours B will take to meet A?", "groundtruth":"1", "answer": "2"}
79 changes: 79 additions & 0 deletions examples/flows/standard/maths-to-code/README.md
@@ -0,0 +1,79 @@
# Math to Code
Math to Code is a project that uses the ChatGPT model to generate code that models a math question, and then executes the generated code to obtain the final numerical answer.

> [!NOTE]
>
> Building a system that generates executable code from user input with an LLM is [a complex problem with potential security risks](https://developer.nvidia.com/blog/securing-llm-systems-against-prompt-injection/); this example is a demonstration rather than something you can use directly in production. To build such a system correctly, you should address key security considerations such as input validation, additional sanitization of the generated code, or, better, running the generated code in a sandboxed environment.

Tools used in this flow:

- `python` tool
- built-in `llm` tool

Connections used in this flow:

- `open_ai` connection

## Prerequisites
Install the promptflow SDK and other dependencies:

```cmd
pip install -r requirements.txt
```

## Setup connection
Prepare your Azure OpenAI resource by following this [instruction](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) and get your `api_key` if you don't have one.

Note that this example uses the [chat API](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/chatgpt?pivots=programming-language-chat-completions), so please use a `gpt-35-turbo` or `gpt-4` model deployment.

Create the connection if you haven't done so already, and ensure you have put your Azure OpenAI API key and endpoint in the [azure_openai.yml](azure_openai.yml) file.
```bash
# Override keys with --set to avoid yaml file changes
pf connection create -f ../../../connections/azure_openai.yml --set api_key=<your_api_key> api_base=<your_api_base>
```

Ensure you have created the `open_ai_connection` connection:
```bash
pf connection show -n open_ai_connection
```


## Run the flow locally

### Run with a single line of input

```bash
# test with default input value in flow.dag.yaml
pf flow test --flow .
# test with specific input
pf flow test --flow . --inputs math_question='If a rectangle has a length of 10 and width of 5, what is the area?'
```
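If you prefer the SDK to the CLI, here is a minimal sketch of the same single-line test (assuming the `promptflow` package from requirements.txt exposes `PFClient.test` as in its docs, and the connection above exists):

```python
# SDK equivalent of `pf flow test`, run from this flow's folder (a sketch).
from promptflow import PFClient

pf_client = PFClient()
result = pf_client.test(
    flow=".",  # the maths-to-code flow directory
    inputs={"math_question": "If a rectangle has a length of 10 and width of 5, what is the area?"},
)
print(result)
```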

### Run with multiple lines of data

- Create a run:
```bash
# create a random run name
run_name="math_to_code_"$(openssl rand -hex 12)
pf run create --flow . --data ./math_data.jsonl --column-mapping math_question='${data.question}' --name $run_name --stream
```

### Get the accuracy using the evaluation flow
Use [eval-accuracy-maths-to-code](../../evaluation/eval-accuracy-maths-to-code/) to evaluate accuracy and error rate metrics against the math-to-code flow.

- accuracy: if the generated code executes successfully and produces a final numeric answer, that answer is compared with the groundtruth in the test data. For a single instance the result is True if the final number equals the groundtruth and False otherwise; accuracy is the percentage of correct answers over the test data.
- error_rate: in some cases the flow cannot produce a numeric answer at all, for example when the generated code fails to execute because of a parsing error or because a dependent package is not available in the conda environment. Error rate is the percentage of such cases over the test data. For example, if 17 of 20 test lines are answered correctly and 1 fails to execute, accuracy is 0.85 and error_rate is 0.05.

```bash
# create a random eval run name
eval_run_name="math_to_code_eval_run_"$(openssl rand -hex 12)

# invoke accuracy and error rate evaluation against math-to-code batch run
pf run create --flow ../../evaluation/eval-accuracy-maths-to-code/ --data ./math_data.jsonl --column-mapping groundtruth='${data.answer}' prediction='${run.outputs.answer}' --run $run_name --name $eval_run_name --stream

# view the run details
pf run show-details -n $eval_run_name
pf run show-metrics -n $eval_run_name
```
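The batch run and evaluation can also be created through the Python SDK. A sketch mirroring the CLI commands above (assuming `PFClient.run` and `PFClient.get_metrics` as in the promptflow docs; run names are assigned by the runtime unless you pass your own):

```python
# SDK equivalents of the `pf run create` commands above (a sketch;
# run from this flow's folder).
from promptflow import PFClient

pf_client = PFClient()

# Batch run of the maths-to-code flow.
base_run = pf_client.run(
    flow=".",
    data="./math_data.jsonl",
    column_mapping={"math_question": "${data.question}"},
    stream=True,
)

# Evaluation run wired to the batch run's outputs.
eval_run = pf_client.run(
    flow="../../evaluation/eval-accuracy-maths-to-code/",
    data="./math_data.jsonl",
    run=base_run,
    column_mapping={
        "groundtruth": "${data.answer}",
        "prediction": "${run.outputs.answer}",
    },
    stream=True,
)

print(pf_client.get_metrics(eval_run))
```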
23 changes: 23 additions & 0 deletions examples/flows/standard/maths-to-code/ask_llm.jinja2
@@ -0,0 +1,23 @@
system:
I want you to act as a Math expert specializing in Algebra, Geometry, and Calculus. Given the question, develop python code to model the user's question.
The python code will print the result at the end.
Please generate executable python code; your reply will be in JSON format, something like:
{
    "code": "print(1+1)"
}

user:
This is a set of examples, each including a question and the corresponding code:
{% for ex in examples %}
QUESTION: {{ ex.question }}
CODE:
{{ ex.code }}

{% endfor %}

Now come to the real task, and make sure to return valid JSON. The JSON should contain a key named "code" whose value is the python code. For example:
{
    "code": "print(1+1)"
}
QUESTION: {{ question }}
CODE:
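The prompt pins the reply to a JSON object with a single `code` key. The flow's parsing step (not shown in this excerpt) presumably turns a malformed reply into the `"JSONDecodeError"` / `"Unknown Error:"` sentinels that `code_execution.py` below checks for; a minimal sketch of that contract:

```python
# Sketch of the parsing contract implied by the prompt above and by the
# sentinel checks in code_execution.py. The actual parsing node may differ.
import json


def parse_llm_reply(reply: str) -> str:
    try:
        return json.loads(reply)["code"]
    except json.JSONDecodeError:
        return "JSONDecodeError"
    except Exception as e:  # e.g. KeyError when "code" is missing
        return "Unknown Error: " + str(e)


if __name__ == "__main__":
    print(parse_llm_reply('{"code": "print(1+1)"}'))  # -> print(1+1)
    print(parse_llm_reply("not json"))                # -> JSONDecodeError
```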
33 changes: 33 additions & 0 deletions examples/flows/standard/maths-to-code/code_execution.py
@@ -0,0 +1,33 @@
from promptflow import tool

import sys
from io import StringIO


@tool
def func_exe(code_snippet: str):
    if code_snippet == "JSONDecodeError" or code_snippet.startswith("Unknown Error:"):
        return code_snippet

    # Redirect stdout so the snippet's printed result can be captured
    old_stdout = sys.stdout
    redirected_output = sys.stdout = StringIO()

    # Execute the code snippet
    try:
        exec(code_snippet.lstrip())
    except Exception as e:
        sys.stdout = old_stdout
        return str(e)

    sys.stdout = old_stdout
    return redirected_output.getvalue().strip()


if __name__ == "__main__":
    print(func_exe("print(5+3)"))
    print(func_exe("count = 0\nfor i in range(100):\n    if i % 8 == 0:\n        count += 1\nprint(count)"))
    print(func_exe("sum = 0\ni = 0\nwhile 3**i < 100:\n    sum += 3**i\n    i += 1\nprint(sum)"))
    print(func_exe("speed_A = 80\nspeed_B = 120\ndistance = 2000\ntime = distance / (speed_A + speed_B)\nprint(time)"))
    print(func_exe("Unknown Error"))
    print(func_exe("JSONDecodeError"))