add colno/end_colno and end_lineno properties to JSON5Token. restructu…

…re JSONObject to store keys and values separately. This change removes `KeyValuePair` as a Node object. Instead, the underlying data model stores a list of keys and a list of values in separate attributes. `JSONObject` exposes `key_value_pairs` as a property that returns a list of named tuples (`typing.NamedTuple`), each providing `key` and `value` attributes. This change brings more congruity with how the stdlib `ast` module handles Python dictionaries, to which Python users may be more accustomed. It also maintains, by interface, the same structure for JSON5 members (https://spec.json5.org/#prod-JSON5MemberList) through the `key_value_pairs` property (although the pairs themselves are not Nodes).
spyoungtech · Aug 3, 2023 · 52164f2 · 52164f2
1 parent d11ae38
commit 52164f2
Show file tree

Hide file tree

Showing 6 changed files with 110 additions and 54 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,6 +5,7 @@ repos:
     -   id: check-yaml
     -   id: end-of-file-fixer
     -   id: trailing-whitespace
+        exclude: ^(tests/.*)
 -   repo: https://github.com/asottile/reorder-python-imports
     rev: v3.10.0
     hooks:
@@ -44,3 +45,4 @@ repos:
         args:
           - "--ignore"
           - "E501,E704,E301,W503,F405,F811,F821,F403,"
+        exclude: ^(tests/.*)
diff --git a/json5/dumper.py b/json5/dumper.py
@@ -200,8 +200,9 @@ def json_object_to_json(self, node: JSONObject) -> Any:
         self.env.write('{')
         if node.leading_wsc:
             self.process_leading_wsc(node)
-        num_pairs = len(node.key_value_pairs)
-        for index, kvp in enumerate(node.key_value_pairs, start=1):
+        key_value_pairs = node.key_value_pairs
+        num_pairs = len(key_value_pairs)
+        for index, kvp in enumerate(key_value_pairs, start=1):
             self.dump(kvp.key)
             self.env.write(':')
             self.dump(kvp.value)

diff --git a/json5/model.py b/json5/model.py
@@ -5,6 +5,7 @@
 from collections import deque
 from typing import Any
 from typing import Literal
+from typing import NamedTuple
 
 from .tokenizer import JSON5Token
 
@@ -35,6 +36,11 @@
 ]
 
 
+class KeyValuePair(NamedTuple):
+    key: Key
+    value: Value
+
+
 def walk(root: Node) -> typing.Generator[Node, None, None]:
     todo = deque([root])
     while todo:
@@ -76,25 +82,13 @@ def __init__(self, tok: JSON5Token | None = None, end_tok: JSON5Token | None = N
     def col_offset(self) -> int | None:
         if self._tok is None:
             return None
-        return self._tok.index
+        return self._tok.colno
 
     @property
     def end_col_offset(self) -> int | None:
         if self._end_tok is None:
             return None
-
-        # TODO fix these cases in the tokenizer
-        if isinstance(self, (DoubleQuotedString, SingleQuotedString)):
-            if '\n' in self.raw_value:
-                return len(self.raw_value.rsplit('\n', 1)[-1])
-            else:
-                return self._end_tok.end
-        elif isinstance(self, BlockComment):
-            if '\n' in self.value:
-                return len(self.value.rsplit('\n', 1)[-1])
-            else:
-                return self._end_tok.end
-        return self._end_tok.end
+        return self._end_tok.end_colno
 
     @property
     def lineno(self) -> int | None:
@@ -107,14 +101,14 @@ def end_lineno(self) -> int | None:
         if self._end_tok is None:
             return None
         r = self._end_tok.lineno
-        # TODO fix these cases in the tokenizer
-        if isinstance(self, (DoubleQuotedString, SingleQuotedString)):
-            return r + self.raw_value.count('\n')
-        elif isinstance(self, BlockComment):
-            return r + self.value.count('\n')
+        # # TODO fix these cases in the tokenizer
+        # if isinstance(self, (DoubleQuotedString, SingleQuotedString)):
+        #     return r + self.raw_value.count('\n')
+        # elif isinstance(self, BlockComment):
+        #     return r + self.value.count('\n')
         return r
 
-    def __repr__(self) -> str:
+    def __str__(self) -> str:
         rep = (
             f"{self.__class__.__name__}("
             + ", ".join(
@@ -157,16 +151,26 @@ def __init__(
         tok: JSON5Token | None = None,
         end_tok: JSON5Token | None = None,
     ):
-        kvps = list(key_value_pairs)
-        for kvp in kvps:
-            assert isinstance(kvp, KeyValuePair), f"Expected key value pair, got {type(kvp)}"
+        keys: list[Key] = []
+        values: list[Value] = []
+        for key, value in key_value_pairs:
+            assert isinstance(key, Key)
+            assert isinstance(value, Value)
+            keys.append(key)
+            values.append(value)
+        assert len(keys) == len(values)
+        self.keys: list[Key] = keys
+        self.values: list[Value] = values
         assert leading_wsc is None or all(isinstance(item, str) or isinstance(item, Comment) for item in leading_wsc)
-        self.key_value_pairs: list[KeyValuePair] = kvps
         self.trailing_comma: TrailingComma | None = trailing_comma
         self.leading_wsc: list[str | Comment] = leading_wsc or []
 
         super().__init__(tok=tok, end_tok=end_tok)
 
+    @property
+    def key_value_pairs(self) -> list[KeyValuePair]:
+        return list(KeyValuePair(key, value) for key, value in zip(self.keys, self.values))
+
 
 class JSONArray(Value):
     def __init__(
@@ -188,16 +192,6 @@ def __init__(
         super().__init__(tok=tok, end_tok=end_tok)
 
 
-class KeyValuePair(Node):
-    def __init__(self, key: Key, value: Value, tok: JSON5Token | None = None, end_tok: JSON5Token | None = None):
-        assert isinstance(key, Key)
-        assert isinstance(value, Value)
-        self.key: Key = key
-        self.value: Value = value
-
-        super().__init__(tok=tok, end_tok=end_tok)
-
-
 class Identifier(Key):
     def __init__(
         self, name: str, raw_value: str | None = None, tok: JSON5Token | None = None, end_tok: JSON5Token | None = None
@@ -301,10 +295,6 @@ def const(self) -> Literal['NaN']:
 
 
 class String(Value, Key):
-    ...
-
-
-class DoubleQuotedString(String):
     def __init__(
         self, characters: str, raw_value: str, tok: JSON5Token | None = None, end_tok: JSON5Token | None = None
     ):
@@ -316,16 +306,12 @@ def __init__(
         super().__init__(tok=tok, end_tok=tok)
 
 
-class SingleQuotedString(String):
-    def __init__(
-        self, characters: str, raw_value: str, tok: JSON5Token | None = None, end_tok: JSON5Token | None = None
-    ):
-        assert isinstance(raw_value, str)
-        assert isinstance(characters, str)
-        self.characters: str = characters
-        self.raw_value: str = raw_value
+class DoubleQuotedString(String):
+    ...
 
-        super().__init__(tok=tok, end_tok=tok)
+
+class SingleQuotedString(String):
+    ...
 
 
 class BooleanLiteral(Value):

diff --git a/json5/parser.py b/json5/parser.py
@@ -266,7 +266,7 @@ def first_key_value_pair(self, p: T_FirstKeyValuePairProduction) -> KeyValuePair
             value.wsc_before.append(wsc)
         for wsc in p.wsc2:
             value.wsc_after.append(wsc)
-        return KeyValuePair(key=p.key, value=p.value, tok=key._tok, end_tok=value._end_tok)
+        return KeyValuePair(key=p.key, value=p.value)
 
     @_('object_delimiter_seen COMMA { wsc } [ first_key_value_pair ]')
     def subsequent_key_value_pair(self, p: SubsequentKeyValuePairProduction) -> KeyValuePair | TrailingComma:
@@ -340,14 +340,14 @@ def seen_RBRACE(self, p: Any) -> None:
     @_('seen_LBRACE LBRACE { wsc } [ key_value_pairs ] seen_RBRACE RBRACE')
     def json_object(self, p: T_JsonObjectProduction) -> JSONObject:
         if not p.key_value_pairs:
-            node = JSONObject(leading_wsc=list(p.wsc or []), tok=p._slice[0], end_tok=p._slice[5])
+            node = JSONObject(leading_wsc=list(p.wsc or []), tok=p._slice[1], end_tok=p._slice[5])
         else:
             kvps, trailing_comma = p.key_value_pairs
             node = JSONObject(
                 *kvps,
                 trailing_comma=trailing_comma,
                 leading_wsc=list(p.wsc or []),
-                tok=p._slice[0],
+                tok=p._slice[1],
                 end_tok=p._slice[5],
             )
 

diff --git a/json5/tokenizer.py b/json5/tokenizer.py
@@ -29,6 +29,19 @@ def __init__(self, tok: Token, doc: str):
         self.doc: str = doc
         self.end: int = tok.end
 
+    @property
+    def colno(self) -> int:
+        line_start_index = self.doc.rfind('\n', 0, self.index) + 1
+        return self.index - line_start_index
+
+    @property
+    def end_colno(self) -> int:
+        return self.colno + self.end - self.index
+
+    @property
+    def end_lineno(self) -> int:
+        return self.lineno + self.value.count('\n')
+
     __slots__ = ('type', 'value', 'lineno', 'index', 'doc', 'end')
 
     def __str__(self) -> str:
@@ -90,8 +103,15 @@ def tokenize(self, text: str, lineno: int = 1, index: int = 0) -> Generator[JSON
     COLON = r"\:"
     COMMA = r"\,"
 
-    DOUBLE_QUOTE_STRING = r'"(?:[^"\\]|\\.)*"'
-    SINGLE_QUOTE_STRING = r"'(?:[^'\\]|\\.)*'"
+    @_(r'"(?:[^"\\]|\\.)*"')
+    def DOUBLE_QUOTE_STRING(self, tok: JSON5Token) -> JSON5Token:
+        self.lineno += tok.value.count('\n')
+        return tok
+
+    @_(r"'(?:[^'\\]|\\.)*'")
+    def SINGLE_QUOTE_STRING(self, tok: JSON5Token) -> JSON5Token:
+        self.lineno += tok.value.count('\n')
+        return tok
 
     LINE_COMMENT = r"//[^\n]*"
 

diff --git a/tests/test_model.py b/tests/test_model.py
@@ -0,0 +1,47 @@
+import ast
+
+import pytest
+
+import json5.loader
+import json5.model
+
+TEST_TEXT = '''\
+{
+    "string_on_same_line":     "string on same line",
+          "multiline_dq_string": "this line has a \
+continuation",  
+      "leadingDecimalPoint": .8675309  ,    
+      "andTrailing":     8675309.,  
+    "trailingComma": 'in objects',   
+        "backwardsCompatible": "with JSON",
+}
+'''
+
+model = json5.loads(TEST_TEXT, loader=json5.loader.ModelLoader())
+tree = ast.parse(TEST_TEXT)
+ast_nodes = [
+    node for node in list(ast.walk(tree)) if not isinstance(node, (ast.Expr, ast.Load, ast.Module, ast.UnaryOp))
+]
+json5_nodes = [
+    node
+    for node in list(json5.model.walk(model))
+    if not isinstance(node, (json5.model.TrailingComma, json5.model.JSONText))
+]
+
+assert len(ast_nodes) == len(json5_nodes)
+
+
+@pytest.mark.parametrize('ast_node, json5_node', list(zip(ast_nodes, json5_nodes)))
+@pytest.mark.parametrize(
+    'attr_name',
+    [
+        'col_offset',
+        'end_col_offset',
+        'lineno',
+        'end_lineno',
+    ],
+)
+def test_node_attribute_accuracy(attr_name: str, ast_node, json5_node):
+    assert getattr(json5_node, attr_name) == getattr(
+        ast_node, attr_name
+    ), f'{attr_name} did not match {ast_node!r}, {json5_node!r}'