| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input |
| 42 | if not input_string: |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Missing opening parenthesis |
| 46 | if not input_string.startswith('('): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Edge Case: Missing closing parenthesis |
| 50 | if not input_string.endswith(')'): |
| 51 | raise ValueError("tree missing") |
| 52 | |
| 53 | # Remove the outer parentheses |
| 54 | content = input_string[1:-1] |
| 55 | |
| 56 | # Edge Case: Empty tree with no nodes |
| 57 | if not content: |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | # Edge Case: Missing semicolon after opening parenthesis |
| 61 | if not content.startswith(';'): |
| 62 | raise ValueError("tree missing") |
| 63 | |
| 64 | return _parse_tree(content) |
| 65 | |
| 66 | |
| 67 | def _parse_tree(content: str) -> SgfTree: |
| 68 | """ |
| 69 | Parse the content of a tree (without outer parentheses). |
| 70 | |
| 71 | Args: |
| 72 | content: The content of the tree |
| 73 | |
| 74 | Returns: |
| 75 | SgfTree: The parsed tree |
| 76 | """ |
| 77 | # Skip the initial semicolon |
| 78 | index = 1 |
| 79 | |
| 80 | # Parse properties of the current node |
| 81 | properties, index = _parse_properties(content, index) |
| 82 | |
| 83 | # Parse children |
| 84 | children = [] |
| 85 | while index < len(content): |
| 86 | if content[index] == '(': # Start of a child tree |
| 87 | child_tree, consumed = _parse_subtree(content[index:]) |
| 88 | children.append(child_tree) |
| 89 | index += consumed |
| 90 | elif content[index] == ';': # Start of a sequential node |
| 91 | # Create a new node with the remaining content |
| 92 | remaining_content = content[index:] |
| 93 | child_tree, consumed = _parse_sequential_node(remaining_content) |
| 94 | children.append(child_tree) |
| 95 | index += consumed |
| 96 | break # Sequential nodes form a chain, so we only process one here |
| 97 | else: |
| 98 | # This shouldn't happen in a valid SGF |
| 99 | break |
| 100 | |
| 101 | return SgfTree(properties=properties, children=children) |
| 102 | |
| 103 | |
| 104 | def _parse_sequential_node(content: str) -> tuple[SgfTree, int]: |
| 105 | """ |
| 106 | Parse a sequential node (starting with semicolon). |
| 107 | |
| 108 | Args: |
| 109 | content: The content starting with ';' |
| 110 | |
| 111 | Returns: |
| 112 | tuple: (parsed tree, number of characters consumed) |
| 113 | """ |
| 114 | # Skip the semicolon |
| 115 | index = 1 |
| 116 | |
| 117 | # Parse properties of the current node |
| 118 | properties, index = _parse_properties(content, index) |
| 119 | |
| 120 | # Parse children |
| 121 | children = [] |
| 122 | consumed = index |
| 123 | |
| 124 | # Check if there are more nodes or subtrees |
| 125 | if index < len(content): |
| 126 | if content[index] == '(': # Start of a child tree |
| 127 | child_tree, child_consumed = _parse_subtree(content[index:]) |
| 128 | children.append(child_tree) |
| 129 | consumed += child_consumed |
| 130 | elif content[index] == ';': # Start of another sequential node |
| 131 | child_tree, child_consumed = _parse_sequential_node(content[index:]) |
| 132 | children.append(child_tree) |
| 133 | consumed += child_consumed |
| 134 | |
| 135 | return SgfTree(properties=properties, children=children), consumed |
| 136 | |
| 137 | |
| 138 | def _parse_subtree(content: str) -> tuple[SgfTree, int]: |
| 139 | """ |
| 140 | Parse a subtree (including its parentheses). |
| 141 | |
| 142 | Args: |
| 143 | content: The content starting with '(' |
| 144 | |
| 145 | Returns: |
| 146 | tuple: (parsed tree, number of characters consumed) |
| 147 | """ |
| 148 | # Find matching closing parenthesis |
| 149 | balance = 0 |
| 150 | consumed = 0 |
| 151 | |
| 152 | for i, char in enumerate(content): |
| 153 | if char == '(': |
| 154 | balance += 1 |
| 155 | elif char == ')': |
| 156 | balance -= 1 |
| 157 | if balance == 0: |
| 158 | consumed = i + 1 |
| 159 | break |
| 160 | |
| 161 | # Edge Case: Mismatched parentheses |
| 162 | if balance != 0: |
| 163 | raise ValueError("tree missing") |
| 164 | |
| 165 | # Parse the content inside the parentheses |
| 166 | inner_content = content[1:consumed-1] |
| 167 | |
| 168 | # Edge Case: Empty subtree |
| 169 | if not inner_content: |
| 170 | raise ValueError("tree with no nodes") |
| 171 | |
| 172 | # Edge Case: Missing semicolon |
| 173 | if not inner_content.startswith(';'): |
| 174 | raise ValueError("tree missing") |
| 175 | |
| 176 | tree = _parse_tree(inner_content) |
| 177 | return tree, consumed |
| 178 | |
| 179 | |
| 180 | def _parse_properties(content: str, start_index: int) -> tuple[dict, int]: |
| 181 | """ |
| 182 | Parse properties from the content starting at start_index. |
| 183 | |
| 184 | Args: |
| 185 | content: The content to parse |
| 186 | start_index: The index to start parsing from |
| 187 | |
| 188 | Returns: |
| 189 | tuple: (properties dictionary, next index to parse) |
| 190 | """ |
| 191 | properties = {} |
| 192 | index = start_index |
| 193 | |
| 194 | while index < len(content) and content[index] not in '()': |
| 195 | # Parse key |
| 196 | key_start = index |
| 197 | while index < len(content) and content[index].isalpha(): |
| 198 | index += 1 |
| 199 | |
| 200 | # Edge Case: Property key must be uppercase |
| 201 | key = content[key_start:index] |
| 202 | if not key.isupper(): |
| 203 | raise ValueError("property must be in uppercase") |
| 204 | |
| 205 | # Edge Case: Missing key |
| 206 | if not key: |
| 207 | raise ValueError("properties without delimiter") |
| 208 | |
| 209 | # Parse values |
| 210 | values = [] |
| 211 | |
| 212 | # There must be at least one value |
| 213 | if index >= len(content) or content[index] != '[': |
| 214 | raise ValueError("properties without delimiter") |
| 215 | |
| 216 | while index < len(content) and content[index] == '[': |
| 217 | index += 1 # Skip '[' |
| 218 | value_start = index |
| 219 | |
| 220 | # Parse value, handling escapes |
| 221 | while index < len(content) and content[index] != ']': |
| 222 | if content[index] == '\\': |
| 223 | index += 2 # Skip escape and next character |
| 224 | else: |
| 225 | index += 1 |
| 226 | |
| 227 | # Edge Case: Unclosed value bracket |
| 228 | if index >= len(content) or content[index] != ']': |
| 229 | raise ValueError("properties without delimiter") |
| 230 | |
| 231 | value = content[value_start:index] |
| 232 | value = _unescape_value(value) |
| 233 | values.append(value) |
| 234 | index += 1 # Skip ']' |
| 235 | |
| 236 | properties[key] = values |
| 237 | |
| 238 | return properties, index |
| 239 | |
| 240 | |
| 241 | def _unescape_value(value: str) -> str: |
| 242 | """ |
| 243 | Process escape sequences in a value according to SGF text type rules. |
| 244 | |
| 245 | Args: |
| 246 | value: The raw value string |
| 247 | |
| 248 | Returns: |
| 249 | str: The processed value |
| 250 | """ |
| 251 | result = [] |
| 252 | i = 0 |
| 253 | while i < len(value): |
| 254 | if value[i] == '\\': |
| 255 | i += 1 |
| 256 | if i < len(value): |
| 257 | # Handle newline removal |
| 258 | if value[i] == '\n': |
| 259 | # Newline immediately after backslash - remove both |
| 260 | i += 1 |
| 261 | elif value[i].isspace(): |
| 262 | # Other whitespace after backslash becomes space |
| 263 | result.append(' ') |
| 264 | i += 1 |
| 265 | elif value[i] == ']': |
| 266 | # Closing bracket - just insert the bracket |
| 267 | result.append(']') |
| 268 | i += 1 |
| 269 | else: |
| 270 | # Any other character - becomes literal \ + character |
| 271 | result.append('\\' + value[i]) |
| 272 | i += 1 |
| 273 | else: |
| 274 | # Trailing backslash - keep as-is |
| 275 | result.append('\\') |
| 276 | else: |
| 277 | # All whitespace other than newline becomes space |
| 278 | if value[i] != '\n' and value[i].isspace(): |
| 279 | result.append(' ') |
| 280 | else: |
| 281 | result.append(value[i]) |
| 282 | i += 1 |
| 283 | |
| 284 | return ''.join(result) |
| 285 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty tree with no nodes, missing semicolon after opening parenthesis, mismatched parentheses, empty subtree, missing semicolon in subtree, property key must be uppercase, missing key, missing delimiter for properties, unclosed value bracket |
| 286 | # Handled Edge Cases: Property key must be uppercase, missing key, missing delimiter for properties, unclosed value bracket |
| 287 | # Handled Edge Cases: Mismatched parentheses, empty subtree, missing semicolon |
| 288 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty tree with no nodes, missing semicolon after opening parenthesis |