| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input |
| 42 | if not input_string: |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Missing opening parenthesis |
| 46 | if not input_string.startswith('('): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Edge Case: Missing closing parenthesis |
| 50 | if not input_string.endswith(')'): |
| 51 | raise ValueError("tree missing") |
| 52 | |
| 53 | # Remove the outer parentheses |
| 54 | content = input_string[1:-1] |
| 55 | |
| 56 | # Edge Case: Empty tree with no nodes |
| 57 | if not content: |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | # Edge Case: Missing semicolon after opening parenthesis |
| 61 | if not content.startswith(';'): |
| 62 | raise ValueError("tree missing") |
| 63 | |
| 64 | return _parse_tree(content) |
| 65 | |
| 66 | |
| 67 | def _parse_tree(content: str) -> SgfTree: |
| 68 | """ |
| 69 | Parse the content of a tree (without the outer parentheses). |
| 70 | |
| 71 | Args: |
| 72 | content: The content of the tree |
| 73 | |
| 74 | Returns: |
| 75 | SgfTree: The parsed tree |
| 76 | """ |
| 77 | # Skip the initial semicolon |
| 78 | index = 1 |
| 79 | |
| 80 | # Parse properties of the current node |
| 81 | properties, index = _parse_properties(content, index) |
| 82 | |
| 83 | # Parse children |
| 84 | children = [] |
| 85 | while index < len(content): |
| 86 | if content[index] == '(': # Start of a child tree (variation) |
| 87 | child_tree, consumed = _parse_subtree(content[index:]) |
| 88 | children.append(child_tree) |
| 89 | index += consumed |
| 90 | elif content[index] == ';': # Start of a sequential child node |
| 91 | child_tree, consumed = _parse_sequential_node(content[index:]) |
| 92 | children.append(child_tree) |
| 93 | index += consumed |
| 94 | else: |
| 95 | # This should not happen in a valid SGF |
| 96 | break |
| 97 | |
| 98 | return SgfTree(properties=properties, children=children) |
| 99 | |
| 100 | |
| 101 | def _parse_subtree(content: str) -> tuple[SgfTree, int]: |
| 102 | """ |
| 103 | Parse a subtree (including the outer parentheses). |
| 104 | |
| 105 | Args: |
| 106 | content: The content starting with '(' |
| 107 | |
| 108 | Returns: |
| 109 | tuple: (parsed tree, number of characters consumed) |
| 110 | """ |
| 111 | # Find matching closing parenthesis |
| 112 | balance = 0 |
| 113 | end_index = 0 |
| 114 | for i, char in enumerate(content): |
| 115 | if char == '(': |
| 116 | # Edge Case: Properly handle nested parentheses |
| 117 | balance += 1 |
| 118 | elif char == ')': |
| 119 | balance -= 1 |
| 120 | if balance == 0: |
| 121 | end_index = i |
| 122 | break |
| 123 | |
| 124 | # Edge Case: Unmatched parentheses |
| 125 | if balance != 0: |
| 126 | raise ValueError("tree missing") |
| 127 | |
| 128 | # Parse the content inside the parentheses |
| 129 | inner_content = content[1:end_index] |
| 130 | |
| 131 | # Edge Case: Empty subtree |
| 132 | if not inner_content: |
| 133 | raise ValueError("tree with no nodes") |
| 134 | |
| 135 | # Edge Case: Missing semicolon |
| 136 | if not inner_content.startswith(';'): |
| 137 | raise ValueError("tree missing") |
| 138 | |
| 139 | tree = _parse_tree(inner_content) |
| 140 | return tree, end_index + 1 |
| 141 | |
| 142 | |
| 143 | def _parse_sequential_node(content: str) -> tuple[SgfTree, int]: |
| 144 | """ |
| 145 | Parse a sequential node (starting with semicolon, not in parentheses). |
| 146 | |
| 147 | Args: |
| 148 | content: The content starting with ';' |
| 149 | |
| 150 | Returns: |
| 151 | tuple: (parsed tree, number of characters consumed) |
| 152 | """ |
| 153 | # Parse properties of this node |
| 154 | properties, index = _parse_properties(content, 1) # Start after the semicolon |
| 155 | |
| 156 | # Parse children of this node |
| 157 | children = [] |
| 158 | while index < len(content): |
| 159 | if content[index] == '(': # Start of a child tree (variation) |
| 160 | child_tree, consumed = _parse_subtree(content[index:]) |
| 161 | children.append(child_tree) |
| 162 | index += consumed |
| 163 | elif content[index] == ';': # Start of a sequential child node |
| 164 | child_tree, consumed = _parse_sequential_node(content[index:]) |
| 165 | children.append(child_tree) |
| 166 | index += consumed |
| 167 | else: |
| 168 | # End of this node |
| 169 | break |
| 170 | |
| 171 | tree = SgfTree(properties=properties, children=children) |
| 172 | return tree, index |
| 173 | |
| 174 | |
| 175 | def _parse_properties(content: str, start_index: int) -> tuple[dict, int]: |
| 176 | """ |
| 177 | Parse properties from the content starting at start_index. |
| 178 | |
| 179 | Args: |
| 180 | content: The content to parse |
| 181 | start_index: The index to start parsing from |
| 182 | |
| 183 | Returns: |
| 184 | tuple: (properties dictionary, next index to parse) |
| 185 | """ |
| 186 | properties = {} |
| 187 | index = start_index |
| 188 | |
| 189 | while index < len(content) and content[index] not in '();': |
| 190 | # Parse key |
| 191 | key_start = index |
| 192 | while index < len(content) and content[index].isalpha(): |
| 193 | index += 1 |
| 194 | |
| 195 | # Edge Case: Empty key (no alphabetic characters found) |
| 196 | if key_start == index: |
| 197 | break |
| 198 | |
| 199 | |
| 200 | # Edge Case: Property key must be uppercase |
| 201 | if not content[key_start:index].isupper(): |
| 202 | raise ValueError("property must be in uppercase") |
| 203 | |
| 204 | key = content[key_start:index] |
| 205 | |
| 206 | # Edge Case: Property without values |
| 207 | if index >= len(content) or content[index] != '[': |
| 208 | raise ValueError("properties without delimiter") |
| 209 | |
| 210 | # Parse values |
| 211 | values = [] |
| 212 | while index < len(content) and content[index] == '[': |
| 213 | index += 1 # Skip opening bracket |
| 214 | value_start = index |
| 215 | |
| 216 | # Parse value, handling escapes |
| 217 | value_chars = [] |
| 218 | while index < len(content): |
| 219 | if content[index] == '\\': |
| 220 | # Add the backslash to the value |
| 221 | value_chars.append('\\') |
| 222 | index += 1 |
| 223 | if index < len(content): |
| 224 | # Add the escaped character to the value |
| 225 | value_chars.append(content[index]) |
| 226 | index += 1 |
| 227 | elif content[index] == ']': |
| 228 | break |
| 229 | else: |
| 230 | value_chars.append(content[index]) |
| 231 | index += 1 |
| 232 | |
| 233 | # Set the value and update the index |
| 234 | value = ''.join(value_chars) |
| 235 | value = _unescape_value(value) |
| 236 | values.append(value) |
| 237 | index += 1 # Skip closing bracket |
| 238 | |
| 239 | |
| 240 | # Edge Case: Property with no values |
| 241 | if not values: |
| 242 | raise ValueError("properties without delimiter") |
| 243 | |
| 244 | |
| 245 | properties[key] = values |
| 246 | |
| 247 | |
| 248 | return properties, index |
| 249 | |
| 250 | |
| 251 | def _unescape_value(value: str) -> str: |
| 252 | """ |
| 253 | Unescape an SGF value according to SGF text type rules. |
| 254 | |
| 255 | Args: |
| 256 | value: The escaped value |
| 257 | |
| 258 | Returns: |
| 259 | str: The unescaped value |
| 260 | """ |
| 261 | result = [] |
| 262 | i = 0 |
| 263 | while i < len(value): |
| 264 | if value[i] == '\\': |
| 265 | i += 1 |
| 266 | if i < len(value): |
| 267 | # Check if next character is newline |
| 268 | if value[i] == '\n': |
| 269 | # Edge Case: Backslash followed by newline removes the newline |
| 270 | pass # Don't add anything |
| 271 | else: |
| 272 | # Edge Case: Backslash followed by any other character adds that character |
| 273 | result.append(value[i]) |
| 274 | i += 1 |
| 275 | else: |
| 276 | # Edge Case: Trailing backslash |
| 277 | result.append('\\') |
| 278 | else: |
| 279 | char = value[i] |
| 280 | # Edge Case: Convert all whitespace (except newline) to space |
| 281 | if char != '\n' and char.isspace(): |
| 282 | result.append(' ') |
| 283 | else: |
| 284 | result.append(char) |
| 285 | i += 1 |
| 286 | |
| 287 | return ''.join(result) |
| 288 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty tree with no nodes, missing semicolon after opening parenthesis, properly handle nested parentheses, unmatched parentheses, empty subtree, missing semicolon, property key must be uppercase, property without values, unmatched bracket, property with no values, backslash followed by newline removes the newline, backslash followed by any other character adds that character, trailing backslash, convert all whitespace (except newline) to space |
| 289 | # Handled Edge Cases: Property key must be uppercase, property without values, unmatched bracket, property with no values |
| 290 | # Handled Edge Cases: Backslash followed by newline removes the newline, backslash followed by any other character adds that character, trailing backslash, convert all whitespace (except newline) to space |