Skip to content

Commit 488ac75

Browse files
authoredMar 7, 2025··
Define TR caching comms in ATD (#353)
To make things more concrete about the interface we want with the backend. test plan: make - [x] I ran `make setup && make` to update the generated code after editing a `.atd` file (TODO: have a CI check) - [x] I made sure we're still backward compatible with old versions of the CLI. For example, the Semgrep backend needs to still be able to *consume* data generated by Semgrep 1.50.0. See https://atd.readthedocs.io/en/latest/atdgen-tutorial.html#smooth-protocol-upgrades Note that the types related to the semgrep-core JSON output or the semgrep-core RPC do not need to be backward compatible!
1 parent dee4b8b commit 488ac75

7 files changed

+11272
-9959
lines changed
 

‎semgrep_output_v1.atd

+69-1
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,8 @@ type sca_match = {
555555
(* Note that in addition to "reachable" there are also the notions of
556556
* "vulnerable" and "exploitable".
557557
* coupling: see also SCA_match.ml
558+
* TODO? have a Direct of xxx and Transitive of sca_transitive_match_kind?
559+
* better so can be reused in other types such as tr_cache_result?
558560
*)
559561
type sca_match_kind = [
560562
(* This is used for "parity" or "upgrade-only" rules. transitivity
@@ -1839,6 +1841,68 @@ type scan_config = {
18391841
?ci_config_from_cloud: ci_config_from_cloud option;
18401842
}
18411843

1844+
(* ------------------------------------------- *)
1845+
(* Transitive reachability (TR) caching comms  *)
1846+
(* ------------------------------------------- *)
1847+
(* We want essentially to cache semgrep computation on third party packages
1848+
* to quickly know (rule_id x package_version) -> sca_transitive_match_kind
1849+
* to avoid downloading and recomputing each time the same thing.
1850+
*)
1851+
1852+
(* The "key".
1853+
* The rule_id and resolved_url should form a valid key for our TR cache
1854+
* database table. Indeed, semgrep should always return the same result when
1855+
* using the same rule and same resolved_url package. The content at the
1856+
* URL should hopefully not change (we could md5sum it just in case) and
1857+
* the content of the rule_id should also not change (could md5sum it maybe too).
1858+
* I've added tr_version below just in case we want to invalidate past
1859+
* cached entries (e.g., the semgrep engine itself changed enough that
1860+
* some past cached results might be wrong and should be recomputed)
1861+
*)
1862+
type tr_cache_key = {
1863+
rule_id: rule_id;
1864+
(* this can be the checksum of the content of the rule (JSON or YAML form) *)
1865+
rule_version: string;
1866+
(* does not have to match the Semgrep CLI version; can be bumped only
1867+
* when we think the match should be recomputed
1868+
* TODO: to be set in Transitive_reachability.ml tr_version constant
1869+
*)
1870+
engine_version: int;
1871+
(* ex: http://some-website/hello-world.0.1.2.tgz like in found_dependency
1872+
* 'resolved_url' field, but could be anything to describe a particular
1873+
* package. We could rely on https://github.com/package-url/purl-spec
1874+
*)
1875+
package_url: string;
1876+
(* extra key just in case (e.g., "prod" vs "dev") *)
1877+
extra: string;
1878+
}
1879+
1880+
(* The "value" *)
1881+
type tr_cache_match_result = {
1882+
(* alt: cache just sca_match? or sca_match_kind? or even define a separate
1883+
* sca_transitive_match type? which would be smaller than storing
1884+
* the whole set of matches
1885+
* alt: cache the whole cli_output? (which also contains the errors)
1886+
*)
1887+
matches: cli_match list;
1888+
}
1889+
1890+
(* Sent by the CLI to the POST /api/???? *)
1891+
type tr_query_cache_request = {
1892+
entries: tr_cache_key list;
1893+
}
1894+
1895+
(* Response by the backend to the POST /api/???? *)
1896+
type tr_query_cache_response = {
1897+
cached: (tr_cache_key * tr_cache_match_result) list;
1898+
}
1899+
1900+
(* Sent by the CLI to the POST /api/??? *)
1901+
type tr_add_cache_request = {
1902+
new_entries: (tr_cache_key * tr_cache_match_result) list;
1903+
}
1904+
(* TODO: tr_add_cache_response: string result (Ok | Error) *)
1905+
18421906
(* ----------------------------- *)
18431907
(* TODO a better CI config from cloud *)
18441908
(* ----------------------------- *)
@@ -2407,6 +2471,10 @@ type resolution_result = [
24072471
| ResolutionError of resolution_error_kind list
24082472
]
24092473

2474+
(* ----------------------------- *)
2475+
(* SCA transitive reachability *)
2476+
(* ----------------------------- *)
2477+
24102478
type transitive_finding = {
24112479
(* the important part is the sca_match in core_match_extra that
24122480
* we need to adjust and especially the sca_match_kind.
@@ -2424,7 +2492,7 @@ type transitive_reachability_filter_params = {
24242492
}
24252493

24262494
(* ----------------------------- *)
2427-
(* SCA part 4: Symbol analysis *)
2495+
(* Symbol analysis *)
24282496
(* ----------------------------- *)
24292497

24302498
(* "Symbol analysis" is about determining the third-party functions which

‎semgrep_output_v1.jsonschema

+69
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎semgrep_output_v1.proto

+25-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)
Please sign in to comment.