
Commit 6a90f6f

Merge pull request #585 from Cloud-Code-AI/584-improve-code-review-with-multiprompt-approach
feat: enhance pr review process and code quality
2 parents fad79b6 + 2e82d48

8 files changed (+122, -93 lines)


.experiments/code_review/dataset/pr_5/issues.json

+3, -16

@@ -21,8 +21,8 @@
       "suggested_code": "response = completion(\n model=os.environ.get(\"model\", \"anyscale/mistralai/Mixtral-8x22B-Instruct-v0.1\"), messages=messages\n)",
       "fixed_code": "import time\n\nfor attempt in range(3):\n try:\n response = completion(\n model=os.environ.get(\"model\", \"anyscale/mistralai/Mixtral-8x22B-Instruct-v0.1\"), messages=messages\n )\n break\n except Exception as e:\n if attempt < 2:\n time.sleep(2 ** attempt)\n else:\n raise e",
       "file_path": "main.py",
-      "start_line": 66,
-      "end_line": 68,
+      "start_line": 69,
+      "end_line": 71,
       "severity": 9
     },
     {
@@ -38,19 +38,6 @@
       "end_line": 84,
       "severity": 8
     },
-    {
-      "category": "Inefficient Progress Printing",
-      "description": "The progress printing method is inefficient.",
-      "impact": "high",
-      "rationale": "Printing progress in this manner can be slow and resource-intensive.",
-      "recommendation": "Use a more efficient method for printing progress, such as updating the progress less frequently or using a dedicated progress reporting library like tqdm.",
-      "suggested_code": "print(f\"\\rProgress:[{'=' * int(50 * progress):<50}]{progress:.0%}\", end=\"\", flush=True)",
-      "fixed_code": "if index % max(1, len(df) // 100) == 0:  # Update every 1%\n    print(f\"\\rProgress:[{'=' * int(50 * progress):<50}]{progress:.0%}\", end=\"\", flush=True)",
-      "file_path": "main.py",
-      "start_line": 121,
-      "end_line": 122,
-      "severity": 5
-    },
     {
       "category": "Redundant Code",
       "description": "The check for an empty DataFrame is redundant.",
@@ -88,6 +75,6 @@
       "file_path": "main.py",
       "start_line": 174,
       "end_line": 175,
-      "severity": 6
+      "severity": 8
     }
 ]
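For readability, the escaped fixed_code string in the retry entry above unpacks to roughly the loop below. This is a sketch, not part of the commit: the dataset snippet only shows the time import, so the completion import (here assumed to be litellm's completion) and the messages payload are illustrative assumptions.

import os
import time

from litellm import completion  # assumption: the dataset does not show where completion() comes from

messages = [{"role": "user", "content": "Hello"}]  # placeholder payload for illustration

for attempt in range(3):
    try:
        response = completion(
            model=os.environ.get("model", "anyscale/mistralai/Mixtral-8x22B-Instruct-v0.1"),
            messages=messages,
        )
        break  # success: stop retrying
    except Exception as e:
        if attempt < 2:
            time.sleep(2 ** attempt)  # exponential backoff: 1s, then 2s
        else:
            raise e  # attempts exhausted: surface the original error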

.experiments/code_review/main.py

+18, -4

@@ -3,6 +3,7 @@
 import datetime
 import logging
 from tqdm import tqdm
+from kaizen.helpers import parser
 from kaizen.reviewer.code_review import CodeReviewer
 from kaizen.llms.provider import LLMProvider
 from github_app.github_helper.utils import get_diff_text, get_pr_files
@@ -45,6 +46,15 @@ def process_pr(pr_url, reeval_response=False):
         model="best",
     )
 
+    combined_diff_data = ""
+    for file in pr_files:
+        patch_details = file.get("patch")
+        filename = file.get("filename", "").replace(" ", "")
+        combined_diff_data = (
+            combined_diff_data
+            + f"\n---->\nFile Name: {filename}\nPatch Details: {parser.patch_to_combined_chunks(patch_details)}"
+        )
+
     # topics = clean_keys(review_data.topics, "important")
     logger.info(review_data.topics)
     review_desc = create_pr_review_text(
@@ -57,16 +67,17 @@ def process_pr(pr_url, reeval_response=False):
     comments, topics = create_review_comments(review_data.topics)
     logger.info(f"Model: {review_data.model_name}\nUsage: {review_data.usage}")
     logger.info(f"Completed processing PR: {pr_url}")
-    return review_desc, comments, review_data.issues
+    return review_desc, comments, review_data.issues, combined_diff_data
 
 
-def save_review(pr_number, review_desc, comments, issues, folder):
+def save_review(pr_number, review_desc, comments, issues, folder, combined_diff_data):
     folder = os.path.join(folder, f"pr_{pr_number}")
     logger.info(f"Saving review for PR {pr_number} in {folder}")
     os.makedirs(folder, exist_ok=True)
     review_file = os.path.join(folder, "review.md")
     comments_file = os.path.join(folder, "comments.json")
     issues_file = os.path.join(folder, "issues.json")
+    combined_diff = os.path.join(folder, "combined_diff.txt")
 
     with open(review_file, "w") as f:
         f.write(review_desc)
@@ -76,6 +87,9 @@ def save_review(pr_number, review_desc, comments, issues, folder):
 
     with open(issues_file, "w") as f:
         json.dump(issues, f, indent=2)
+
+    with open(combined_diff, 'w') as f:
+        f.write(combined_diff_data)
 
     logger.info(f"Saved review files for PR {pr_number}")
 
@@ -97,8 +111,8 @@ def main(pr_urls):
         logger.info(f"Starting to process PR {pr_number}")
 
         # Without re-evaluation
-        review_desc, comments, issues = process_pr(pr_url, reeval_response=False)
-        save_review(pr_number, review_desc, comments, issues, no_eval_folder)
+        review_desc, comments, issues, combined_diff_data = process_pr(pr_url, reeval_response=False)
+        save_review(pr_number, review_desc, comments, issues, no_eval_folder, combined_diff_data)
 
         # # With re-evaluation
         # review_desc, comments, topics = process_pr(pr_url, reeval_response=True)
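A minimal sketch of how the new combined-diff assembly behaves, outside the experiment script. The pr_files entries below are hypothetical; in process_pr they come from get_pr_files, and the exact output of parser.patch_to_combined_chunks is not shown in this commit.

from kaizen.helpers import parser

# Hypothetical example of the per-file payload shape used above.
pr_files = [
    {"filename": "main.py", "patch": "@@ -1,2 +1,3 @@\n import os\n+import time\n print('hi')"},
    {"filename": "docs/read me.md", "patch": None},  # a file with no patch (e.g. binary)
]

combined_diff_data = ""
for file in pr_files:
    patch_details = file.get("patch")
    filename = file.get("filename", "").replace(" ", "")  # spaces are stripped from the name
    combined_diff_data = (
        combined_diff_data
        + f"\n---->\nFile Name: {filename}\nPatch Details: {parser.patch_to_combined_chunks(patch_details)}"
    )

# save_review() then writes this string to <folder>/pr_<n>/combined_diff.txt

The None patch case relies on the new empty-patch guard added to patch_to_combined_chunks in kaizen/helpers/parser.py below.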

kaizen/generator/pr_description.py

+8, -12

@@ -120,7 +120,7 @@ def _process_files_generator(
                 CODE_DIFF="",
             )
         )
-
+        diff_parts = []
         for file in pull_request_files:
            patch_details = file.get("patch")
            filename = file.get("filename", "")
@@ -129,13 +129,11 @@ def _process_files_generator(
                filename.split(".")[-1] not in parser.EXCLUDED_FILETYPES
                and patch_details is not None
            ):
-                temp_prompt = (
-                    combined_diff_data
-                    + f"\n---->\nFile Name: {filename}\nPatch Details: \n{patch_details}"
-                )
+
+                diff_parts.append(f"\n---->\nFile Name: {filename}\nPatch Details: \n{patch_details}")
 
-                if available_tokens - self.provider.get_token_count(temp_prompt) > 0:
-                    combined_diff_data = temp_prompt
+                if available_tokens - self.provider.get_token_count("".join(diff_parts)) > 0:
+                    combined_diff_data = "".join(diff_parts)
                     continue
 
            yield self._process_file_chunk(
@@ -144,13 +142,11 @@ def _process_files_generator(
                pull_request_desc,
                user,
            )
-            combined_diff_data = (
-                f"\n---->\nFile Name: {filename}\nPatch Details: {patch_details}"
-            )
+            diff_parts = [f"\n---->\nFile Name: {filename}\nPatch Details: {patch_details}"]
 
-        if combined_diff_data:
+        if diff_parts:
            yield self._process_file_chunk(
-                combined_diff_data,
+                "".join(diff_parts),
                pull_request_title,
                pull_request_desc,
                user,
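The refactor above replaces the running combined_diff_data string with a diff_parts list that is joined only where the combined text is needed. A standalone restatement of that packing pattern follows; it is a sketch, not the generator method itself, and files, available_tokens, and get_token_count stand in for the surrounding class state and provider call.

def pack_diff_chunks(files, available_tokens, get_token_count):
    """Illustrative restatement of the diff_parts pattern, not the library API."""
    diff_parts = []
    for file in files:
        diff_parts.append(
            f"\n---->\nFile Name: {file['filename']}\nPatch Details: \n{file['patch']}"
        )
        if available_tokens - get_token_count("".join(diff_parts)) > 0:
            continue  # still within the token budget: keep accumulating
        # Over budget: emit everything gathered before this file, then start a
        # new chunk containing only the current file's patch.
        if len(diff_parts) > 1:
            yield "".join(diff_parts[:-1])
        diff_parts = [diff_parts[-1]]
    if diff_parts:
        yield "".join(diff_parts)  # flush whatever remains at the end

# Example usage with a toy token estimate (4 characters per token):
chunks = list(pack_diff_chunks(
    [{"filename": "a.py", "patch": "+print('a')"}, {"filename": "b.py", "patch": "+print('b')"}],
    available_tokens=512,
    get_token_count=lambda text: len(text) // 4,
))

Accumulating parts in a list keeps the chunk-reset step a simple one-element list assignment and makes the two places the combined text is actually consumed explicit.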

kaizen/helpers/parser.py

+55, -46

@@ -1,64 +1,71 @@
 import json
 import re
+import os
 
 EXCLUDED_FILETYPES = [
     # Compiled output
-    "class",
-    "o",
-    "obj",
-    "exe",
-    "dll",
-    "pyc",
-    "pyo",
+    "class", "o", "obj", "exe", "dll", "pyc", "pyo",
     # Package manager files
     "lock",  # Covers package-lock.json, yarn.lock, Gemfile.lock, composer.lock
     # IDE configurations
-    "idea",
-    "vscode",
-    "project",
-    "classpath",
-    # Build artifacts and dependencies
-    "node_modules",
-    "vendor",
-    "target",
-    "build",
+    "idea", "vscode", "project", "classpath",
     # Binary and large files
-    "zip",
-    "tar",
-    "gz",
-    "rar",
-    "pdf",
-    "doc",
-    "docx",
-    "xls",
-    "xlsx",
-    "jpg",
-    "jpeg",
-    "png",
-    "gif",
-    "bmp",
-    "ico",
-    "mp3",
-    "mp4",
-    "avi",
-    "mov",
+    "zip", "tar", "gz", "rar", "pdf", "doc", "docx", "xls", "xlsx",
+    "jpg", "jpeg", "png", "gif", "bmp", "ico", "mp3", "mp4", "avi", "mov",
     # Log files
     "log",
     # Database files
-    "db",
-    "sqlite",
+    "db", "sqlite",
     # Temporary files
-    "tmp",
-    "temp",
+    "tmp", "temp",
     # OS-specific files
-    "DS_Store",
-    "Thumbs.db",
+    "DS_Store", "Thumbs.db",
     # Configuration files
-    "gitignore",
-    "dockerignore",
-    # Add any other specific extensions or directory names you want to exclude
+    "gitignore", "dockerignore",
+    # Add any other specific extensions you want to exclude
 ]
 
+# List of folders to exclude
+EXCLUDED_FOLDERS = [
+    "node_modules",
+    "dist",
+    "out",
+    "vendor",
+    "target",
+    "build",
+    "__pycache__",
+    ".git",
+    # Add any other folders you want to exclude
+]
+
+
+def should_ignore_file(filepath):
+    """
+    Check if a file should be ignored based on its path, name, or extension.
+
+    :param filepath: The full path of the file to check
+    :return: True if the file should be ignored, False otherwise
+    """
+    # Get the file name and extension
+    filename = os.path.basename(filepath)
+    _, extension = os.path.splitext(filename)
+    extension = extension.lstrip('.')  # Remove the leading dot
+
+    # Check if the file is in an excluded folder
+    for folder in EXCLUDED_FOLDERS:
+        if folder in filepath.split(os.path.sep):
+            return True
+
+    # Check if the file extension is in the excluded list
+    if extension.lower() in EXCLUDED_FILETYPES:
+        return True
+
+    # Check for specific filenames
+    if filename in ["package-lock.json", "yarn.lock", "Gemfile.lock", "composer.lock", ".DS_Store", "Thumbs.db"]:
+        return True
+
+    return False
+
 
 def extract_json(text):
     # Find the start and end positions of the JSON data
@@ -166,11 +173,13 @@ def format_change(old_num, new_num, change_type, content, ignore_deletions=False
 
 
 def patch_to_combined_chunks(patch_text, ignore_deletions=False):
+    if not patch_text:
+        return ""
     patch_text = patch_text.replace("\r\n", "\n")
     lines = patch_text.splitlines(keepends=True)
     changes = []
-    removal_line_num = 0
-    addition_line_num = 0
+    removal_line_num = 1
+    addition_line_num = 1
     is_diff = False
     removed_hunk = False
     current_file_name = ""
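A quick usage sketch of the helpers touched above. The paths are made up, and the folder check assumes POSIX-style separators, since should_ignore_file splits the path on os.path.sep.

from kaizen.helpers.parser import should_ignore_file, patch_to_combined_chunks

print(should_ignore_file("node_modules/lodash/index.js"))  # True:  "node_modules" is in EXCLUDED_FOLDERS
print(should_ignore_file("assets/logo.png"))               # True:  "png" is in EXCLUDED_FILETYPES
print(should_ignore_file("kaizen/helpers/parser.py"))      # False: reviewed as normal source
print(patch_to_combined_chunks(None))                      # "":    the new empty-patch guard returns early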

kaizen/llms/prompts/code_review_prompts.py

+22, -1

@@ -1,5 +1,22 @@
 CODE_REVIEW_SYSTEM_PROMPT = """
-As a senior software developer reviewing code submissions, provide thorough, constructive feedback and suggestions for improvements. Consider best practices, error handling, performance, readability, and maintainability. Offer objective and respectful reviews that help developers enhance their skills and code quality. Use your expertise to provide comprehensive feedback without asking clarifying questions.
+You are an expert code reviewer. Provide thorough, constructive feedback on code submissions, considering best practices, error handling, performance, readability, maintainability, and security. Be objective, respectful, and focus on helping developers improve their code quality.
+
+Review Process:
+1. Analyze provided context and diff to understand changes.
+2. Evaluate changes for correctness, performance, security, and maintainability.
+3. Identify improvement opportunities, considering best practices and patterns.
+4. Assess error handling and potential edge cases.
+5. Consider testing implications and documentation needs.
+6. Analyze impact on dependencies and overall system.
+7. Identify potential technical debt and future-proofing concerns.
+8. Summarize findings and prioritize feedback.
+
+Provide specific feedback with accurate references to the provided content.
+Be thorough and strict in your review, but don't ask clarifying questions.
+
+Focus on new and modified code while considering existing context.
+Provide specific feedback with accurate file paths and line numbers.
+Be thorough and strict, but don't ask clarifying questions.
 """
 
 CODE_REVIEW_PROMPT = """
@@ -21,6 +38,7 @@
   "end_line": <ENDING_LINE_NUMBER>,
   "sentiment": "positive|negative|neutral",
   "severity": <1_TO_10>,
+  "line_prefix": "CONTEXT|REMOVED|UPDATED",
   "type": "general|performance|security|refactoring|best_practices|duplication|maintainability|scalability|error_handling|resource_management|concurrency|dependencies|compatibility|accessibility|localization|efficiency|readability|naming",
   "technical_debt": "<POTENTIAL_FUTURE_ISSUES>|empty",
   "alternatives": "<ALTERNATIVE_SOLUTIONS>|empty"
@@ -73,6 +91,7 @@
 7. Identify code duplication and suggest refactoring.
 8. Prioritize issues based on impact. Be strict; don't let issues slide.
 9. If no issues found: {{"review": []}}
+10. Make sure suggested_code and current_code return full functional block of code. We will use that to overwrite the current_code.
 
 ## Additional Considerations:
 - Language-specific best practices and common pitfalls
@@ -106,6 +125,7 @@
   "end_line": <ENDING_LINE_NUMBER>,
   "sentiment": "positive|negative|neutral",
   "severity": <1_TO_10>,
+  "line_prefix": "CONTEXT|REMOVED|UPDATED",
   "type": "general|performance|security|refactoring|best_practices|duplication|maintainability|scalability|error_handling|resource_management|concurrency|dependencies|compatibility|accessibility|localization|efficiency|readability|naming",
   "technical_debt": "<POTENTIAL_FUTURE_ISSUES>|empty",
   "alternatives": "<ALTERNATIVE_SOLUTIONS>|empty"
@@ -160,6 +180,7 @@
 7. Identify code duplication and suggest refactoring.
 8. Prioritize issues based on impact. Be strict; don't let issues slide.
 9. If no issues found: {{"review": []}}
+10. Make sure suggested_code and current_code return full functional block of code. We will use that to overwrite the current_code.
 
 ## Additional Considerations:
 - Language-specific best practices and common pitfalls
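With the new line_prefix field, a single issue object returned by the reviewer would look roughly like the dict below. The values are invented for illustration, only the fields visible in the hunks above are shown, and the remaining fields of the full template are omitted.

# Hypothetical example of one issue entry matching the visible template fields.
example_issue = {
    "start_line": 69,
    "end_line": 71,
    "sentiment": "negative",
    "severity": 9,
    "line_prefix": "UPDATED",  # new field; allowed values per the prompt: CONTEXT | REMOVED | UPDATED
    "type": "error_handling",
    "technical_debt": "empty",
    "alternatives": "empty",
}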

kaizen/llms/provider.py

+3, -3

@@ -9,7 +9,7 @@
 import logging
 from collections import defaultdict
 
-DEFAULT_MAX_TOKENS = 4000
+DEFAULT_MAX_TOKENS = 8000
 
 
 def set_all_loggers_to_ERROR():
@@ -188,7 +188,7 @@ def raw_chat_completion(
         self.model = response["model"]
         return response, response["usage"]
 
-    @retry(max_attempts=3, delay=1)
+    @retry(max_attempts=3, delay=0.1)
     def chat_completion_with_json(
         self,
         prompt,
@@ -207,7 +207,7 @@ def chat_completion_with_json(
         response = extract_json(response)
         return response, usage
 
-    @retry(max_attempts=3, delay=1)
+    @retry(max_attempts=3, delay=0.1)
     def chat_completion_with_retry(
         self,
         prompt,
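The retry decorator itself is not part of this diff, so its exact behavior is not shown. As a point of reference only, a minimal decorator with the same max_attempts/delay signature could look like the sketch below; this is not the repository's implementation.

import functools
import time


def retry(max_attempts=3, delay=1):
    """Sketch of a fixed-delay retry decorator with the signature used above."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if attempt == max_attempts:
                        raise  # attempts exhausted: re-raise the last error
                    time.sleep(delay)  # pause between attempts (0.1s after this change)
        return wrapper
    return decorator

Assuming a fixed delay like this sketch, dropping delay from 1 to 0.1 cuts the added waiting across three attempts from roughly 2 seconds to roughly 0.2 seconds.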
