# Editing file: _parser.py
import string
from types import MappingProxyType
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    FrozenSet,
    Iterable,
    Optional,
    TextIO,
    Tuple,
)

from pip._vendor.tomli._re import (
    RE_BIN,
    RE_DATETIME,
    RE_HEX,
    RE_LOCALTIME,
    RE_NUMBER,
    RE_OCT,
    match_to_datetime,
    match_to_localtime,
    match_to_number,
)

if TYPE_CHECKING:
    from re import Pattern


ASCII_CTRL = frozenset(chr(i) for i in range(32)) | frozenset(chr(127))

# Neither of these sets include quotation mark or backslash. They are
# currently handled as separate cases in the parser functions.
ILLEGAL_BASIC_STR_CHARS = ASCII_CTRL - frozenset("\t")
ILLEGAL_MULTILINE_BASIC_STR_CHARS = ASCII_CTRL - frozenset("\t\n\r")

ILLEGAL_LITERAL_STR_CHARS = ILLEGAL_BASIC_STR_CHARS
ILLEGAL_MULTILINE_LITERAL_STR_CHARS = ASCII_CTRL - frozenset("\t\n")

ILLEGAL_COMMENT_CHARS = ILLEGAL_BASIC_STR_CHARS

TOML_WS = frozenset(" \t")
TOML_WS_AND_NEWLINE = TOML_WS | frozenset("\n")
BARE_KEY_CHARS = frozenset(string.ascii_letters + string.digits + "-_")
KEY_INITIAL_CHARS = BARE_KEY_CHARS | frozenset("\"'")

BASIC_STR_ESCAPE_REPLACEMENTS = MappingProxyType(
    {
        "\\b": "\u0008",  # backspace
        "\\t": "\u0009",  # tab
        "\\n": "\u000A",  # linefeed
        "\\f": "\u000C",  # form feed
        "\\r": "\u000D",  # carriage return
        '\\"': "\u0022",  # quote
        "\\\\": "\u005C",  # backslash
    }
)

# Type annotations
ParseFloat = Callable[[str], Any]
Key = Tuple[str, ...]
Pos = int


class TOMLDecodeError(ValueError):
    """An error raised if a document is not valid TOML."""


def load(fp: TextIO, *, parse_float: ParseFloat = float) -> Dict[str, Any]:
    """Parse TOML from a file object."""
    s = fp.read()
    return loads(s, parse_float=parse_float)


def loads(s: str, *, parse_float: ParseFloat = float) -> Dict[str, Any]:  # noqa: C901
    """Parse TOML from a string."""

    # The spec allows converting "\r\n" to "\n", even in string
    # literals. Let's do so to simplify parsing.
    src = s.replace("\r\n", "\n")
    pos = 0
    state = State()

    # Parse one statement at a time
    # (typically means one line in TOML source)
    while True:
        # 1. Skip line leading whitespace
        pos = skip_chars(src, pos, TOML_WS)

        # 2. Parse rules. Expect one of the following:
        #    - end of file
        #    - end of line
        #    - comment
        #    - key/value pair
        #    - append dict to list (and move to its namespace)
        #    - create dict (and move to its namespace)
        # Skip trailing whitespace when applicable.
        try:
            char = src[pos]
        except IndexError:
            break
        if char == "\n":
            pos += 1
            continue
        if char in KEY_INITIAL_CHARS:
            pos = key_value_rule(src, pos, state, parse_float)
            pos = skip_chars(src, pos, TOML_WS)
        elif char == "[":
            try:
                second_char: Optional[str] = src[pos + 1]
            except IndexError:
                second_char = None
            if second_char == "[":
                pos = create_list_rule(src, pos, state)
            else:
                pos = create_dict_rule(src, pos, state)
            pos = skip_chars(src, pos, TOML_WS)
        elif char != "#":
            raise suffixed_err(src, pos, "Invalid statement")

        # 3. Skip comment
        pos = skip_comment(src, pos)

        # 4. Expect end of line or end of file
        try:
            char = src[pos]
        except IndexError:
            break
        if char != "\n":
            raise suffixed_err(
                src, pos, "Expected newline or end of document after a statement"
            )
        pos += 1

    return state.out.dict


class State:
    def __init__(self) -> None:
        # Mutable, read-only
        self.out = NestedDict()
        self.flags = Flags()

        # Immutable, read and write
        self.header_namespace: Key = ()


class Flags:
    """Flags that map to parsed keys/namespaces."""

    # Marks an immutable namespace (inline array or inline table).
    FROZEN = 0
    # Marks a nest that has been explicitly created and can no longer
    # be opened using the "[table]" syntax.
    EXPLICIT_NEST = 1

    def __init__(self) -> None:
        self._flags: Dict[str, dict] = {}

    def unset_all(self, key: Key) -> None:
        cont = self._flags
        for k in key[:-1]:
            if k not in cont:
                return
            cont = cont[k]["nested"]
        cont.pop(key[-1], None)

    def set_for_relative_key(self, head_key: Key, rel_key: Key, flag: int) -> None:
        cont = self._flags
        for k in head_key:
            if k not in cont:
                cont[k] = {"flags": set(), "recursive_flags": set(), "nested": {}}
            cont = cont[k]["nested"]
        for k in rel_key:
            if k in cont:
                cont[k]["flags"].add(flag)
            else:
                cont[k] = {"flags": {flag}, "recursive_flags": set(), "nested": {}}
            cont = cont[k]["nested"]

    def set(self, key: Key, flag: int, *, recursive: bool) -> None:  # noqa: A003
        cont = self._flags
        key_parent, key_stem = key[:-1], key[-1]
        for k in key_parent:
            if k not in cont:
                cont[k] = {"flags": set(), "recursive_flags": set(), "nested": {}}
            cont = cont[k]["nested"]
        if key_stem not in cont:
            cont[key_stem] = {"flags": set(), "recursive_flags": set(), "nested": {}}
        cont[key_stem]["recursive_flags" if recursive else "flags"].add(flag)

    def is_(self, key: Key, flag: int) -> bool:
        if not key:
            return False  # document root has no flags
        cont = self._flags
        for k in key[:-1]:
            if k not in cont:
                return False
            inner_cont = cont[k]
            if flag in inner_cont["recursive_flags"]:
                return True
            cont = inner_cont["nested"]
        key_stem = key[-1]
        if key_stem in cont:
            cont = cont[key_stem]
            return flag in cont["flags"] or flag in cont["recursive_flags"]
        return False


class NestedDict:
    def __init__(self) -> None:
        # The parsed content of the TOML document
        self.dict: Dict[str, Any] = {}

    def get_or_create_nest(
        self,
        key: Key,
        *,
        access_lists: bool = True,
    ) -> dict:
        cont: Any = self.dict
        for k in key:
            if k not in cont:
                cont[k] = {}
            cont = cont[k]
            if access_lists and isinstance(cont, list):
                cont = cont[-1]
            if not isinstance(cont, dict):
                raise KeyError("There is no nest behind this key")
        return cont

    def append_nest_to_list(self, key: Key) -> None:
        cont = self.get_or_create_nest(key[:-1])
        last_key = key[-1]
        if last_key in cont:
            list_ = cont[last_key]
            if not isinstance(list_, list):
                raise KeyError("An object other than list found behind this key")
            list_.append({})
        else:
            cont[last_key] = [{}]


def skip_chars(src: str, pos: Pos, chars: Iterable[str]) -> Pos:
    try:
        while src[pos] in chars:
            pos += 1
    except IndexError:
        pass
    return pos


def skip_until(
    src: str,
    pos: Pos,
    expect: str,
    *,
    error_on: FrozenSet[str],
    error_on_eof: bool,
) -> Pos:
    try:
        new_pos = src.index(expect, pos)
    except ValueError:
        new_pos = len(src)
        if error_on_eof:
            raise suffixed_err(src, new_pos, f'Expected "{expect!r}"')
    bad_chars = error_on.intersection(src[pos:new_pos])
    if bad_chars:
        bad_char = next(iter(bad_chars))
        bad_pos = src.index(bad_char, pos)
        raise suffixed_err(src, bad_pos, f'Found invalid character "{bad_char!r}"')
    return new_pos


def skip_comment(src: str, pos: Pos) -> Pos:
    try:
        char: Optional[str] = src[pos]
    except IndexError:
        char = None
    if char == "#":
        return skip_until(
            src, pos + 1, "\n", error_on=ILLEGAL_COMMENT_CHARS, error_on_eof=False
        )
    return pos


def skip_comments_and_array_ws(src: str, pos: Pos) -> Pos:
    while True:
        pos_before_skip = pos
        pos = skip_chars(src, pos, TOML_WS_AND_NEWLINE)
        pos = skip_comment(src, pos)
        if pos == pos_before_skip:
            return pos


def create_dict_rule(src: str, pos: Pos, state: State) -> Pos:
    pos += 1  # Skip "["
    pos = skip_chars(src, pos, TOML_WS)
    pos, key = parse_key(src, pos)

    if state.flags.is_(key, Flags.EXPLICIT_NEST) or state.flags.is_(key, Flags.FROZEN):
        raise suffixed_err(src, pos, f"Can not declare {key} twice")
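    # Record that this table header has now been explicitly declared;
    # a repeated "[key]" header for the same table will fail the check above.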
    state.flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
    try:
        state.out.get_or_create_nest(key)
    except KeyError:
        raise suffixed_err(src, pos, "Can not overwrite a value")
    state.header_namespace = key

    if src[pos : pos + 1] != "]":
        raise suffixed_err(src, pos, 'Expected "]" at the end of a table declaration')
    return pos + 1


def create_list_rule(src: str, pos: Pos, state: State) -> Pos:
    pos += 2  # Skip "[["
    pos = skip_chars(src, pos, TOML_WS)
    pos, key = parse_key(src, pos)

    if state.flags.is_(key, Flags.FROZEN):
        raise suffixed_err(src, pos, f"Can not mutate immutable namespace {key}")
    # Free the namespace now that it points to another empty list item...
    state.flags.unset_all(key)
    # ...but this key precisely is still prohibited from table declaration
    state.flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
    try:
        state.out.append_nest_to_list(key)
    except KeyError:
        raise suffixed_err(src, pos, "Can not overwrite a value")
    state.header_namespace = key

    end_marker = src[pos : pos + 2]
    if end_marker != "]]":
        raise suffixed_err(
            src,
            pos,
            f'Found "{end_marker!r}" at the end of an array declaration.'
            ' Expected "]]"',
        )
    return pos + 2


def key_value_rule(src: str, pos: Pos, state: State, parse_float: ParseFloat) -> Pos:
    pos, key, value = parse_key_value_pair(src, pos, parse_float)
    key_parent, key_stem = key[:-1], key[-1]
    abs_key_parent = state.header_namespace + key_parent

    if state.flags.is_(abs_key_parent, Flags.FROZEN):
        raise suffixed_err(
            src, pos, f"Can not mutate immutable namespace {abs_key_parent}"
        )
    # Containers in the relative path can't be opened with the table syntax after this
    state.flags.set_for_relative_key(state.header_namespace, key, Flags.EXPLICIT_NEST)
    try:
        nest = state.out.get_or_create_nest(abs_key_parent)
    except KeyError:
        raise suffixed_err(src, pos, "Can not overwrite a value")
    if key_stem in nest:
        raise suffixed_err(src, pos, "Can not overwrite a value")
    # Mark inline table and array namespaces recursively immutable
    if isinstance(value, (dict, list)):
        abs_key = state.header_namespace + key
        state.flags.set(abs_key, Flags.FROZEN, recursive=True)
    nest[key_stem] = value
    return pos


def parse_key_value_pair(
    src: str, pos: Pos, parse_float: ParseFloat
) -> Tuple[Pos, Key, Any]:
    pos, key = parse_key(src, pos)
    try:
        char: Optional[str] = src[pos]
    except IndexError:
        char = None
    if char != "=":
        raise suffixed_err(src, pos, 'Expected "=" after a key in a key/value pair')
    pos += 1
    pos = skip_chars(src, pos, TOML_WS)
    pos, value = parse_value(src, pos, parse_float)
    return pos, key, value


def parse_key(src: str, pos: Pos) -> Tuple[Pos, Key]:
    pos, key_part = parse_key_part(src, pos)
    key = [key_part]
    pos = skip_chars(src, pos, TOML_WS)
    while True:
        try:
            char: Optional[str] = src[pos]
        except IndexError:
            char = None
        if char != ".":
            return pos, tuple(key)
        pos += 1
        pos = skip_chars(src, pos, TOML_WS)
        pos, key_part = parse_key_part(src, pos)
        key.append(key_part)
        pos = skip_chars(src, pos, TOML_WS)


def parse_key_part(src: str, pos: Pos) -> Tuple[Pos, str]:
    try:
        char: Optional[str] = src[pos]
    except IndexError:
        char = None
    if char in BARE_KEY_CHARS:
        start_pos = pos
        pos = skip_chars(src, pos, BARE_KEY_CHARS)
        return pos, src[start_pos:pos]
    if char == "'":
        return parse_literal_str(src, pos)
    if char == '"':
        return parse_one_line_basic_str(src, pos)
    raise suffixed_err(src, pos, "Invalid initial character for a key part")


def parse_one_line_basic_str(src: str, pos: Pos) -> Tuple[Pos, str]:
    pos += 1
    return parse_basic_str(src, pos, multiline=False)


def parse_array(src: str, pos: Pos, parse_float: ParseFloat) -> Tuple[Pos, list]:
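    # src[pos] is the opening "[" when this is called; it is skipped just below.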
    pos += 1
    array: list = []

    pos = skip_comments_and_array_ws(src, pos)
    if src[pos : pos + 1] == "]":
        return pos + 1, array
    while True:
        pos, val = parse_value(src, pos, parse_float)
        array.append(val)
        pos = skip_comments_and_array_ws(src, pos)

        c = src[pos : pos + 1]
        if c == "]":
            return pos + 1, array
        if c != ",":
            raise suffixed_err(src, pos, "Unclosed array")
        pos += 1

        pos = skip_comments_and_array_ws(src, pos)
        if src[pos : pos + 1] == "]":
            return pos + 1, array


def parse_inline_table(src: str, pos: Pos, parse_float: ParseFloat) -> Tuple[Pos, dict]:
    pos += 1
    nested_dict = NestedDict()
    flags = Flags()

    pos = skip_chars(src, pos, TOML_WS)
    if src[pos : pos + 1] == "}":
        return pos + 1, nested_dict.dict
    while True:
        pos, key, value = parse_key_value_pair(src, pos, parse_float)
        key_parent, key_stem = key[:-1], key[-1]
        if flags.is_(key, Flags.FROZEN):
            raise suffixed_err(src, pos, f"Can not mutate immutable namespace {key}")
        try:
            nest = nested_dict.get_or_create_nest(key_parent, access_lists=False)
        except KeyError:
            raise suffixed_err(src, pos, "Can not overwrite a value")
        if key_stem in nest:
            raise suffixed_err(src, pos, f'Duplicate inline table key "{key_stem}"')
        nest[key_stem] = value
        pos = skip_chars(src, pos, TOML_WS)
        c = src[pos : pos + 1]
        if c == "}":
            return pos + 1, nested_dict.dict
        if c != ",":
            raise suffixed_err(src, pos, "Unclosed inline table")
        if isinstance(value, (dict, list)):
            flags.set(key, Flags.FROZEN, recursive=True)
        pos += 1
        pos = skip_chars(src, pos, TOML_WS)


def parse_basic_str_escape(
    src: str, pos: Pos, *, multiline: bool = False
) -> Tuple[Pos, str]:
    escape_id = src[pos : pos + 2]
    pos += 2
    if multiline and escape_id in {"\\ ", "\\\t", "\\\n"}:
        # Skip whitespace until next non-whitespace character or end of
        # the doc. Error if non-whitespace is found before newline.
if escape_id != "\\\n": pos = skip_chars(src, pos, TOML_WS) char = src[pos : pos + 1] if not char: return pos, "" if char != "\n": raise suffixed_err(src, pos, 'Unescaped "\\" in a string') pos += 1 pos = skip_chars(src, pos, TOML_WS_AND_NEWLINE) return pos, "" if escape_id == "\\u": return parse_hex_char(src, pos, 4) if escape_id == "\\U": return parse_hex_char(src, pos, 8) try: return pos, BASIC_STR_ESCAPE_REPLACEMENTS[escape_id] except KeyError: if len(escape_id) != 2: raise suffixed_err(src, pos, "Unterminated string") raise suffixed_err(src, pos, 'Unescaped "\\" in a string') def parse_basic_str_escape_multiline(src: str, pos: Pos) -> Tuple[Pos, str]: return parse_basic_str_escape(src, pos, multiline=True) def parse_hex_char(src: str, pos: Pos, hex_len: int) -> Tuple[Pos, str]: hex_str = src[pos : pos + hex_len] if len(hex_str) != hex_len or any(c not in string.hexdigits for c in hex_str): raise suffixed_err(src, pos, "Invalid hex value") pos += hex_len hex_int = int(hex_str, 16) if not is_unicode_scalar_value(hex_int): raise suffixed_err(src, pos, "Escaped character is not a Unicode scalar value") return pos, chr(hex_int) def parse_literal_str(src: str, pos: Pos) -> Tuple[Pos, str]: pos += 1 # Skip starting apostrophe start_pos = pos pos = skip_until( src, pos, "'", error_on=ILLEGAL_LITERAL_STR_CHARS, error_on_eof=True ) return pos + 1, src[start_pos:pos] # Skip ending apostrophe def parse_multiline_str(src: str, pos: Pos, *, literal: bool) -> Tuple[Pos, str]: pos += 3 if src[pos : pos + 1] == "\n": pos += 1 if literal: delim = "'" end_pos = skip_until( src, pos, "'''", error_on=ILLEGAL_MULTILINE_LITERAL_STR_CHARS, error_on_eof=True, ) result = src[pos:end_pos] pos = end_pos + 3 else: delim = '"' pos, result = parse_basic_str(src, pos, multiline=True) # Add at maximum two extra apostrophes/quotes if the end sequence # is 4 or 5 chars long instead of just 3. 
    if src[pos : pos + 1] != delim:
        return pos, result
    pos += 1
    if src[pos : pos + 1] != delim:
        return pos, result + delim
    pos += 1
    return pos, result + (delim * 2)


def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> Tuple[Pos, str]:
    if multiline:
        error_on = ILLEGAL_MULTILINE_BASIC_STR_CHARS
        parse_escapes = parse_basic_str_escape_multiline
    else:
        error_on = ILLEGAL_BASIC_STR_CHARS
        parse_escapes = parse_basic_str_escape
    result = ""
    start_pos = pos
    while True:
        try:
            char = src[pos]
        except IndexError:
            raise suffixed_err(src, pos, "Unterminated string")
        if char == '"':
            if not multiline:
                return pos + 1, result + src[start_pos:pos]
            if src[pos + 1 : pos + 3] == '""':
                return pos + 3, result + src[start_pos:pos]
            pos += 1
            continue
        if char == "\\":
            result += src[start_pos:pos]
            pos, parsed_escape = parse_escapes(src, pos)
            result += parsed_escape
            start_pos = pos
            continue
        if char in error_on:
            raise suffixed_err(src, pos, f'Illegal character "{char!r}"')
        pos += 1


def parse_regex(src: str, pos: Pos, regex: "Pattern") -> Tuple[Pos, str]:
    match = regex.match(src, pos)
    if not match:
        raise suffixed_err(src, pos, "Unexpected sequence")
    return match.end(), match.group()


def parse_value(  # noqa: C901
    src: str, pos: Pos, parse_float: ParseFloat
) -> Tuple[Pos, Any]:
    try:
        char: Optional[str] = src[pos]
    except IndexError:
        char = None

    # Basic strings
    if char == '"':
        if src[pos + 1 : pos + 3] == '""':
            return parse_multiline_str(src, pos, literal=False)
        return parse_one_line_basic_str(src, pos)

    # Literal strings
    if char == "'":
        if src[pos + 1 : pos + 3] == "''":
            return parse_multiline_str(src, pos, literal=True)
        return parse_literal_str(src, pos)

    # Booleans
    if char == "t":
        if src[pos + 1 : pos + 4] == "rue":
            return pos + 4, True
    if char == "f":
        if src[pos + 1 : pos + 5] == "alse":
            return pos + 5, False

    # Dates and times
    datetime_match = RE_DATETIME.match(src, pos)
    if datetime_match:
        try:
            datetime_obj = match_to_datetime(datetime_match)
        except ValueError:
            raise suffixed_err(src, pos, "Invalid date or datetime")
        return datetime_match.end(), datetime_obj
    localtime_match = RE_LOCALTIME.match(src, pos)
    if localtime_match:
        return localtime_match.end(), match_to_localtime(localtime_match)

    # Non-decimal integers
    if char == "0":
        second_char = src[pos + 1 : pos + 2]
        if second_char == "x":
            pos, hex_str = parse_regex(src, pos + 2, RE_HEX)
            return pos, int(hex_str, 16)
        if second_char == "o":
            pos, oct_str = parse_regex(src, pos + 2, RE_OCT)
            return pos, int(oct_str, 8)
        if second_char == "b":
            pos, bin_str = parse_regex(src, pos + 2, RE_BIN)
            return pos, int(bin_str, 2)

    # Decimal integers and "normal" floats.
    # The regex will greedily match any type starting with a decimal
    # char, so needs to be located after handling of non-decimal ints,
    # and dates and times.
    number_match = RE_NUMBER.match(src, pos)
    if number_match:
        return number_match.end(), match_to_number(number_match, parse_float)

    # Arrays
    if char == "[":
        return parse_array(src, pos, parse_float)

    # Inline tables
    if char == "{":
        return parse_inline_table(src, pos, parse_float)

    # Special floats
    first_three = src[pos : pos + 3]
    if first_three in {"inf", "nan"}:
        return pos + 3, parse_float(first_three)
    first_four = src[pos : pos + 4]
    if first_four in {"-inf", "+inf", "-nan", "+nan"}:
        return pos + 4, parse_float(first_four)

    raise suffixed_err(src, pos, "Invalid value")


def suffixed_err(src: str, pos: Pos, msg: str) -> TOMLDecodeError:
    """Return a `TOMLDecodeError` where error message is suffixed with
    coordinates in source."""

    def coord_repr(src: str, pos: Pos) -> str:
        if pos >= len(src):
            return "end of document"
        line = src.count("\n", 0, pos) + 1
        if line == 1:
            column = pos + 1
        else:
            column = pos - src.rindex("\n", 0, pos)
        return f"line {line}, column {column}"

    return TOMLDecodeError(f"{msg} (at {coord_repr(src, pos)})")


def is_unicode_scalar_value(codepoint: int) -> bool:
    return (0 <= codepoint <= 55295) or (57344 <= codepoint <= 1114111)
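

# Minimal usage sketch (an illustrative addition, not part of the upstream
# tomli source): ``loads`` turns a TOML string into plain dicts/lists, and
# ``load`` does the same for a text-mode file object. The TOML snippet below
# is made up for the example; the guard keeps normal imports unaffected.
if __name__ == "__main__":
    _example = 'title = "demo"\n[server]\nport = 8080\nhosts = ["a", "b"]\n'
    _parsed = loads(_example)
    assert _parsed == {"title": "demo", "server": {"port": 8080, "hosts": ["a", "b"]}}
    print(_parsed)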