| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input |
| 42 | if not input_string: |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Input doesn't start with '(' |
| 46 | if not input_string.startswith('('): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Edge Case: Input doesn't end with ')' |
| 50 | if not input_string.endswith(')'): |
| 51 | raise ValueError("tree missing") |
| 52 | |
| 53 | # Remove the outer parentheses |
| 54 | content = input_string[1:-1] |
| 55 | |
| 56 | # Edge Case: Empty tree with no nodes |
| 57 | if not content: |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | # Parse the tree |
| 61 | tree, _ = _parse_tree(content, 0) |
| 62 | return tree |
| 63 | |
| 64 | |
| 65 | def _parse_tree(content: str, index: int) -> tuple[SgfTree, int]: |
| 66 | """ |
| 67 | Parse a tree from the content string starting at index. |
| 68 | |
| 69 | Args: |
| 70 | content: The SGF content string |
| 71 | index: The starting index to parse from |
| 72 | |
| 73 | Returns: |
| 74 | tuple: (SgfTree, new_index) The parsed tree and the index after parsing |
| 75 | """ |
| 76 | # Edge Case: No semicolon to start node |
| 77 | if index >= len(content) or content[index] != ';': |
| 78 | raise ValueError("tree missing") |
| 79 | |
| 80 | index += 1 # Skip the semicolon |
| 81 | |
| 82 | # Parse properties of the current node |
| 83 | properties = {} |
| 84 | while index < len(content) and content[index] not in '();': |
| 85 | # Parse key |
| 86 | key_start = index |
| 87 | while index < len(content) and content[index].isalpha() and content[index].isupper(): |
| 88 | index += 1 |
| 89 | |
| 90 | # Edge Case: Key is not uppercase |
| 91 | if index == key_start: |
| 92 | raise ValueError("property must be in uppercase") |
| 93 | |
| 94 | key = content[key_start:index] |
| 95 | |
| 96 | # Edge Case: No values for the property |
| 97 | if index >= len(content) or content[index] != '[': |
| 98 | raise ValueError("properties without delimiter") |
| 99 | |
| 100 | # Parse values |
| 101 | values = [] |
| 102 | while index < len(content) and content[index] == '[': |
| 103 | index += 1 # Skip '[' |
| 104 | value_start = index |
| 105 | |
| 106 | # Parse value, handling escapes |
| 107 | value_chars = [] |
| 108 | while index < len(content) and content[index] != ']': |
| 109 | if content[index] == '\\': |
| 110 | index += 1 |
| 111 | if index >= len(content): |
| 112 | raise ValueError("properties without delimiter") |
| 113 | |
| 114 | char = content[index] |
| 115 | # According to SGF specification, when backslash is followed by newline, |
| 116 | # both backslash and newline are removed |
| 117 | if char == '\n': |
| 118 | # Skip both backslash and newline - they are both removed |
| 119 | pass |
| 120 | else: |
| 121 | # For other escaped characters, add the backslash and character |
| 122 | # The text processing will handle the rest |
| 123 | value_chars.append('\\') |
| 124 | value_chars.append(char) |
| 125 | else: |
| 126 | value_chars.append(content[index]) |
| 127 | index += 1 |
| 128 | |
| 129 | # Edge Case: Unclosed bracket |
| 130 | if index >= len(content) or content[index] != ']': |
| 131 | raise ValueError("properties without delimiter") |
| 132 | |
| 133 | # Process the value according to SGF text rules |
| 134 | value = _process_sgf_text(''.join(value_chars)) |
| 135 | values.append(value) |
| 136 | index += 1 # Skip ']' |
| 137 | |
| 138 | properties[key] = values |
| 139 | |
| 140 | # Parse children |
| 141 | children = [] |
| 142 | |
| 143 | # Handle children in parentheses (variations) |
| 144 | while index < len(content) and content[index] == '(': |
| 145 | child, index = _parse_tree(content, index+1) # Skip '(' |
| 146 | |
| 147 | # Find the matching ')' |
| 148 | paren_count = 1 |
| 149 | i = index |
| 150 | while i < len(content) and paren_count > 0: |
| 151 | if content[i] == '(': |
| 152 | paren_count += 1 |
| 153 | elif content[i] == ')': |
| 154 | paren_count -= 1 |
| 155 | i += 1 |
| 156 | |
| 157 | # Skip to after the closing parenthesis |
| 158 | if paren_count == 0: |
| 159 | index = i |
| 160 | |
| 161 | children.append(child) |
| 162 | |
| 163 | # Handle sequential nodes (without parentheses) |
| 164 | while index < len(content) and content[index] == ';': |
| 165 | child, index = _parse_tree(content, index) |
| 166 | children.append(child) |
| 167 | |
| 168 | return SgfTree(properties, children), index |
| 169 | |
| 170 | |
| 171 | def _process_sgf_text(text: str) -> str: |
| 172 | """ |
| 173 | Process SGF text according to SGF specification. |
| 174 | |
| 175 | Args: |
| 176 | text: Raw text from SGF property value |
| 177 | |
| 178 | Returns: |
| 179 | str: Processed text |
| 180 | """ |
| 181 | result = [] |
| 182 | i = 0 |
| 183 | while i < len(text): |
| 184 | if text[i] == '\\': |
| 185 | i += 1 |
| 186 | if i >= len(text): |
| 187 | result.append('\\') |
| 188 | break |
| 189 | |
| 190 | # Handle escaped characters according to SGF rules |
| 191 | char = text[i] |
| 192 | # If the character after backslash is a newline, remove both backslash and newline |
| 193 | if char == '\n': |
| 194 | # Skip the newline (it's removed) |
| 195 | pass |
| 196 | # For other whitespace characters after backslash, convert to space |
| 197 | elif char != '\n' and char.isspace(): |
| 198 | result.append(' ') |
| 199 | # For non-whitespace characters after backslash, insert as-is |
| 200 | else: |
| 201 | result.append(char) |
| 202 | else: |
| 203 | # Convert whitespace (except newlines) to spaces |
| 204 | if text[i] != '\n' and text[i].isspace(): |
| 205 | result.append(' ') |
| 206 | else: |
| 207 | result.append(text[i]) |
| 208 | i += 1 |
| 209 | |
| 210 | return ''.join(result) |
| 211 | |
| 212 | # Handled Edge Cases: Empty input, Input doesn't start with '(', Input doesn't end with ')', Empty tree with no nodes, No semicolon to start node, Key is not uppercase, No values for the property, Unclosed bracket" |