-
Notifications
You must be signed in to change notification settings - Fork 598
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add support for analysis of source code/scripted languages #1080
base: master
Are you sure you want to change the base?
Changes from 1 commit
bbd3f70
8173397
428f6bc
a6d7ba2
80bf78b
cf3dc7e
9d7f575
3d4b4ec
eca7ead
5fd953f
1f79db9
a58bc0b
5ddb8ba
31e2fb9
5bf3f18
a4529fc
d5de9a1
6c10458
9bd9824
2594849
619ed94
5e23802
5d83e8d
9570523
7c5e6e3
1e0326a
ca1939f
d7ab2db
5cfbecc
26cc1bc
2a9e76f
672ca71
ca426ca
fd80277
d0c4acb
ad31d83
e52a9b3
b27713b
b2df2b0
a0379a6
eeecb63
cebc5e1
d7dcc94
32dc5ff
5e85a6e
614900f
bb08181
1fd9d4a
7ba978f
25cf09b
e693573
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
…/html: aspx.
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,12 @@ | ||
import re | ||
from typing import Dict, List, Tuple, Union, Iterator | ||
from collections import defaultdict | ||
from dataclasses import dataclass | ||
from typing import List, Tuple, Iterator, Optional | ||
|
||
from tree_sitter import Node, Tree, Parser | ||
|
||
import capa.features.extractors.ts.sig | ||
import capa.features.extractors.ts.build | ||
from capa.features.address import FileOffsetRangeAddress | ||
from capa.features.extractors.script import LANG_CS, LANG_JS | ||
from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML | ||
from capa.features.extractors.ts.query import ( | ||
QueryBinding, | ||
HTMLQueryBinding, | ||
|
@@ -21,18 +19,14 @@ | |
class TreeSitterBaseEngine: | ||
buf: bytes | ||
language: str | ||
path: str | ||
query: QueryBinding | ||
tree: Tree | ||
|
||
def __init__(self, language: str, path: str): | ||
def __init__(self, language: str, buf: bytes): | ||
capa.features.extractors.ts.build.ts_build() | ||
self.language = language | ||
self.query = QueryBindingFactory.from_language(language) | ||
self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) | ||
self.path = path | ||
with open(self.path, "rb") as f: | ||
self.buf = f.read() | ||
self.buf = buf | ||
self.tree = self.parse() | ||
|
||
def parse(self) -> Tree: | ||
|
@@ -46,19 +40,27 @@ def get_byte_range(self, node: Node) -> bytes: | |
def get_range(self, node: Node) -> str: | ||
return self.get_byte_range(node).decode() | ||
adamstorek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def get_address(self, node: Node): | ||
def get_address(self, node: Node) -> FileOffsetRangeAddress: | ||
return FileOffsetRangeAddress(node.start_byte, node.end_byte) | ||
|
||
def get_default_address(self): | ||
def get_default_address(self) -> FileOffsetRangeAddress: | ||
return self.get_address(self.tree.root_node) | ||
|
||
|
||
class TreeSitterExtractorEngine(TreeSitterBaseEngine): | ||
query: ScriptQueryBinding | ||
import_signatures: set | ||
buf_offset: int | ||
namespaces: set[str] | ||
|
||
def __init__(self, language: str, buf: bytes, buf_offset: int = 0, additional_namespaces: set[str] = None): | ||
super().__init__(language, buf) | ||
self.buf_offset = buf_offset | ||
self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) | ||
self.namespaces = additional_namespaces if additional_namespaces is not None else set() | ||
|
||
def __init__(self, language: str, path: str): | ||
super().__init__(language, path) | ||
def get_address(self, node: Node) -> FileOffsetRangeAddress: | ||
return FileOffsetRangeAddress(self.buf_offset + node.start_byte, self.buf_offset + node.end_byte) | ||
|
||
def get_new_objects(self, node: Node) -> List[Tuple[Node, str]]: | ||
return self.query.new_object.captures(node) | ||
|
@@ -73,13 +75,13 @@ def get_new_object_ids(self, node: Node) -> Iterator[Node]: | |
# TODO: move this elsewhere, does not fit this class | ||
def get_import_names(self, node: Node) -> Iterator[Tuple[Node, str]]: | ||
join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) | ||
namespaces = set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()]) | ||
self.namespaces = self.namespaces.union(set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()])) | ||
for obj_node in self.get_new_object_ids(node): | ||
obj_name = self.get_range(obj_node) | ||
if obj_name in self.import_signatures: | ||
yield (obj_node, obj_name) | ||
continue | ||
for namespace in namespaces: | ||
for namespace in self.namespaces: | ||
obj_name = join_names(namespace, obj_name) | ||
if obj_name in self.import_signatures: | ||
yield (obj_node, obj_name) | ||
|
@@ -107,13 +109,13 @@ def get_function_call_ids(self, node: Node) -> Iterator[Node]: | |
# TODO: move this elsewhere, does not fit this class | ||
def get_function_names(self, node: Node) -> Iterator[Tuple[Node, str]]: | ||
join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) | ||
namespaces = set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()]) | ||
self.namespaces = self.namespaces.union(set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()])) | ||
for fn_node in self.get_function_call_ids(node): | ||
fn_name = self.get_range(fn_node) | ||
if fn_name in self.import_signatures: | ||
yield (fn_node, fn_name) | ||
continue | ||
for namespace in namespaces: | ||
for namespace in self.namespaces: | ||
fn_name = join_names(namespace, fn_name) | ||
if fn_name in self.import_signatures: | ||
yield (fn_node, fn_name) | ||
|
@@ -131,65 +133,73 @@ def get_global_statements(self) -> List[Tuple[Node, str]]: | |
return self.query.global_statement.captures(self.tree.root_node) | ||
|
||
|
||
@dataclass | ||
class ASPXPseudoNode: | ||
start_byte: int | ||
end_byte: int | ||
|
||
|
||
class TreeSitterTemplateEngine(TreeSitterBaseEngine): | ||
query: TemplateQueryBinding | ||
|
||
def __init__(self, language: str, path: str): | ||
super().__init__(language, path) | ||
def __init__(self, buf: bytes): | ||
super().__init__(LANG_TEM, buf) | ||
|
||
def get_code_sections(self) -> List[Tuple[Node, str]]: | ||
return self.query.code.captures(self.tree.root_node) | ||
|
||
def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: | ||
template_namespaces = set(name for _, name in self.get_template_namespaces()) | ||
for node, _ in self.get_code_sections(): | ||
yield TreeSitterExtractorEngine( | ||
self.identify_language(), self.get_byte_range(node), node.start_byte, template_namespaces | ||
) | ||
|
||
def get_content_sections(self) -> List[Tuple[Node, str]]: | ||
return self.query.content.captures(self.tree.root_node) | ||
|
||
def get_template_namespaces(self) -> Iterator[ASPXPseudoNode]: | ||
def identify_language(self) -> str: | ||
for node, _ in self.get_code_sections(): | ||
if self.is_c_sharp(node): | ||
return LANG_CS | ||
return LANG_JS | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What if it is neither? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe there is no easy way to remedy this. What I understand about templates in general is that the syntax is determined by the templating engine. In other words, there is no easy way to detect from an unknown template which templating engine is being used (asp.net (and if so, what language), razor, ejs, erb, mako, jinja2, django, cheetah, go's html/template, etc.), not to mention that each has its own syntax (some might use regular programming languages like C# to embed server logic, some might just contain very rudimentary placeholders/logic). Here I am assuming that we only support EJS and C# in ASPX at the moment as embedded templates. This is because the Tree-sitter embedded templates parser can only parse EJS and ERB (and we are not interested in embedded Ruby at the moment as far as I'm concerned). What's more, the default language for ASPX is VB, therefore if anyone wants to use C#, they need to include a @ Page directive with a Language attribute (see: https://docs.microsoft.com/en-us/previous-versions/aspnet/k33801s3(v=vs.100), https://docs.microsoft.com/en-us/previous-versions/dotnet/netframework-4.0/ydy4x04a(v=vs.100)?redirectedfrom=MSDN, https://docs.microsoft.com/en-us/previous-versions/aspnet/fbdt8kk7(v=vs.100)). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we still assume that it's JS whenever it's not CS? |
||
|
||
def get_template_namespaces(self) -> Iterator[Tuple[Node, str]]: | ||
for node, _ in self.get_code_sections(): | ||
if self.is_aspx_import_directive: | ||
ns = self.get_aspx_namespace(node) | ||
if ns is not None: | ||
yield ns | ||
namespace = self.get_aspx_namespace(node) | ||
if namespace is not None: | ||
yield node, namespace | ||
|
||
def is_aspx(self, node: Node) -> bool: | ||
return self.get_byte_range(node).startswith(b"@") | ||
def is_c_sharp(self, node: Node) -> bool: | ||
return len(re.findall(r'@ .*Page Language\s*=\s*"C#".*'.encode(), self.get_byte_range(node))) > 0 | ||
|
||
def is_aspx_import_directive(self, node: Node) -> bool: | ||
return self.get_byte_range(node).startswith(b"@ Import namespace=") | ||
|
||
def get_aspx_namespace(self, node: Node) -> Union[ASPXPseudoNode, None]: | ||
def get_aspx_namespace(self, node: Node) -> Optional[str]: | ||
match = re.search(r'@ Import namespace="(.*?)"'.encode(), self.get_byte_range(node)) | ||
if match is None: | ||
return None | ||
return ASPXPseudoNode(node.start_byte + match.span()[0], node.start_byte + match.span()[1]) | ||
return match.group().decode() if match is not None else None | ||
|
||
|
||
class TreeSitterHTMLEngine(TreeSitterBaseEngine): | ||
query: HTMLQueryBinding | ||
namespaces: set[str] | ||
|
||
def __init__(self, language: str, path: str): | ||
super().__init__(language, path) | ||
def __init__(self, buf: bytes, additional_namespaces: set[str] = None): | ||
adamstorek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
super().__init__(LANG_HTML, buf) | ||
self.namespaces = additional_namespaces if additional_namespaces is not None else set() | ||
adamstorek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def get_scripts(self) -> List[Tuple[Node, str]]: | ||
return self.query.script_element.captures(self.tree.root_node) | ||
|
||
def get_attributes(self, node: Node) -> List[Tuple[Node, str]]: | ||
return self.query.attribute.captures(self.tree.root_node) | ||
|
||
def get_code_sections_by_language(self) -> Dict[str, List[Node]]: | ||
code_sections = defaultdict(list) | ||
def get_code_sections(self) -> Iterator[Node]: | ||
for script_node, _ in self.get_scripts(): | ||
for attribute_node, _ in self.get_attributes(script_node): | ||
script_language = self.identify_script_language(attribute_node) | ||
code_sections[script_language].append(attribute_node) | ||
return code_sections | ||
yield attribute_node | ||
|
||
def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: | ||
for node in self.get_code_sections(): | ||
yield TreeSitterExtractorEngine(self.identify_language(node), self.get_byte_range(node), node.start_byte) | ||
|
||
def identify_script_language(self, node: Node) -> str: | ||
def identify_language(self, node: Node) -> str: | ||
if self.is_server_side_c_sharp(node): | ||
return LANG_CS | ||
return LANG_JS | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, let's find a better place for this initialization.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A global in this file is a good place to start.