| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | if not input_string: |
| 30 | raise ValueError("tree missing") |
| 31 | |
| 32 | if not input_string.startswith('(') or not input_string.endswith(')'): |
| 33 | raise ValueError("tree missing") |
| 34 | |
| 35 | # Remove outer parentheses |
| 36 | content = input_string[1:-1] |
| 37 | |
| 38 | if not content: |
| 39 | raise ValueError("tree missing") |
| 40 | |
| 41 | # Check if content starts with semicolon (required for a valid tree) |
| 42 | if not content.startswith(';'): |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | return _parse_tree(content) |
| 46 | |
| 47 | |
| 48 | def _parse_tree(content: str) -> SgfTree: |
| 49 | if not content.startswith(';'): |
| 50 | # This must be a node list in parentheses |
| 51 | return _parse_variations(content) |
| 52 | |
| 53 | # Parse the first node |
| 54 | node_content, rest = _parse_node_content(content[1:]) # Skip the ';' |
| 55 | properties = _parse_properties(node_content) |
| 56 | |
| 57 | # Check if this is an empty node (no properties) |
| 58 | if not properties and not rest: |
| 59 | raise ValueError("tree with no nodes") |
| 60 | |
| 61 | children = [] |
| 62 | # Parse any remaining children |
| 63 | while rest: |
| 64 | if rest.startswith('('): |
| 65 | # This is a variation - check if it contains a semicolon at the base level |
| 66 | variation_end = _find_variation_end(rest) |
| 67 | variation_content = rest[1:variation_end-1] # Exclude parentheses |
| 68 | |
| 69 | # Check if there's a semicolon in the variation content at base level |
| 70 | # Only split if the semicolon comes after nested variations (parentheses) |
| 71 | semicolon_pos = _find_semicolon_at_level(variation_content) |
| 72 | if semicolon_pos != -1 and '(' in variation_content[:semicolon_pos]: |
| 73 | # Split the variation content at the semicolon |
| 74 | first_part = variation_content[:semicolon_pos] |
| 75 | second_part = variation_content[semicolon_pos:] |
| 76 | |
| 77 | # Parse the first part as a variation |
| 78 | first_variation = _parse_tree(first_part) |
| 79 | children.append(first_variation) |
| 80 | |
| 81 | # Parse the second part and add it as a child |
| 82 | second_tree = _parse_tree(second_part) |
| 83 | children.append(second_tree) |
| 84 | |
| 85 | rest = rest[variation_end:] # Skip the entire variation |
| 86 | else: |
| 87 | # Normal variation parsing |
| 88 | variation, rest = _parse_variation(rest) |
| 89 | children.append(variation) |
| 90 | elif rest.startswith(';'): |
| 91 | # This is a direct child node |
| 92 | child, rest = _parse_single_node(rest) |
| 93 | children.append(child) |
| 94 | else: |
| 95 | break |
| 96 | |
| 97 | return SgfTree(properties=properties, children=children) |
| 98 | |
| 99 | |
| 100 | def _parse_node_content(content: str) -> tuple[str, str]: |
| 101 | # Find the end of this node's properties |
| 102 | i = 0 |
| 103 | while i < len(content): |
| 104 | if content[i] == '(' or content[i] == ';': |
| 105 | # Start of child node |
| 106 | return content[:i], content[i:] |
| 107 | i += 1 |
| 108 | return content, "" |
| 109 | |
| 110 | |
| 111 | def _parse_properties(content: str) -> dict: |
| 112 | properties = {} |
| 113 | i = 0 |
| 114 | |
| 115 | # Check if content is empty - this means no properties |
| 116 | if not content.strip(): |
| 117 | return properties |
| 118 | |
| 119 | while i < len(content): |
| 120 | # Skip any whitespace before key |
| 121 | while i < len(content) and content[i].isspace(): |
| 122 | i += 1 |
| 123 | |
| 124 | if i >= len(content): |
| 125 | break |
| 126 | |
| 127 | # Parse key |
| 128 | if not content[i].isupper(): |
| 129 | raise ValueError("property must be in uppercase") |
| 130 | |
| 131 | key_start = i |
| 132 | while i < len(content) and content[i].isupper(): |
| 133 | i += 1 |
| 134 | |
| 135 | if i == key_start: |
| 136 | raise ValueError("properties without delimiter") |
| 137 | |
| 138 | key = content[key_start:i] |
| 139 | |
| 140 | # Parse values |
| 141 | if i >= len(content) or content[i] != '[': |
| 142 | raise ValueError("properties without delimiter") |
| 143 | |
| 144 | values = [] |
| 145 | while i < len(content) and content[i] == '[': |
| 146 | i += 1 # Skip '[' |
| 147 | value_start = i |
| 148 | # Find matching ']' - handle escaped brackets |
| 149 | bracket_count = 1 |
| 150 | while i < len(content) and bracket_count > 0: |
| 151 | if content[i] == '\\': |
| 152 | # Skip escaped character |
| 153 | i += 2 if i + 1 < len(content) else 1 |
| 154 | elif content[i] == '[': |
| 155 | bracket_count += 1 |
| 156 | i += 1 |
| 157 | elif content[i] == ']': |
| 158 | bracket_count -= 1 |
| 159 | if bracket_count > 0: |
| 160 | i += 1 |
| 161 | else: |
| 162 | i += 1 |
| 163 | |
| 164 | if i >= len(content): |
| 165 | raise ValueError("properties without delimiter") |
| 166 | |
| 167 | value = _parse_text(content[value_start:i]) |
| 168 | values.append(value) |
| 169 | i += 1 # Skip ']' |
| 170 | |
| 171 | properties[key] = values |
| 172 | |
| 173 | return properties |
| 174 | |
| 175 | |
| 176 | def _parse_text(text: str) -> str: |
| 177 | result = [] |
| 178 | i = 0 |
| 179 | while i < len(text): |
| 180 | if text[i] == '\\': |
| 181 | # Handle escape sequence |
| 182 | i += 1 |
| 183 | if i < len(text): |
| 184 | char = text[i] |
| 185 | # Handle specific escape sequences |
| 186 | if char == 'n': |
| 187 | result.append('\n') |
| 188 | elif char == 't': |
| 189 | result.append('\t') |
| 190 | elif char == 'r': |
| 191 | result.append('\r') |
| 192 | elif char == '\\': |
| 193 | result.append('\\') |
| 194 | elif char in ' \t\n': |
| 195 | # Whitespace after \ follows normal rules |
| 196 | if char == '\n': |
| 197 | # Newline immediately after \ is removed |
| 198 | pass |
| 199 | else: |
| 200 | # Other whitespace becomes space |
| 201 | result.append(' ') |
| 202 | else: |
| 203 | # Non-whitespace character is inserted as-is |
| 204 | result.append(char) |
| 205 | else: |
| 206 | # Backslash at end of text |
| 207 | result.append('\\') |
| 208 | else: |
| 209 | # Handle normal characters |
| 210 | if text[i] == '\n': |
| 211 | result.append('\n') |
| 212 | elif text[i] in ' \t': |
| 213 | # Convert whitespace to space |
| 214 | result.append(' ') |
| 215 | else: |
| 216 | result.append(text[i]) |
| 217 | i += 1 |
| 218 | |
| 219 | # Now collapse consecutive spaces, but preserve leading and trailing |
| 220 | result_str = ''.join(result) |
| 221 | |
| 222 | # Find first non-space character |
| 223 | first_non_space = 0 |
| 224 | while first_non_space < len(result_str) and result_str[first_non_space] == ' ': |
| 225 | first_non_space += 1 |
| 226 | |
| 227 | # Find last non-space character |
| 228 | last_non_space = len(result_str) - 1 |
| 229 | while last_non_space >= 0 and result_str[last_non_space] == ' ': |
| 230 | last_non_space -= 1 |
| 231 | |
| 232 | if first_non_space >= len(result_str): # All spaces |
| 233 | return result_str |
| 234 | |
| 235 | # Preserve leading and trailing spaces, collapse middle |
| 236 | leading = result_str[:first_non_space] |
| 237 | middle = result_str[first_non_space:last_non_space + 1] |
| 238 | trailing = result_str[last_non_space + 1:] |
| 239 | |
| 240 | # Collapse consecutive spaces in middle |
| 241 | collapsed_middle = [] |
| 242 | prev_was_space = False |
| 243 | for char in middle: |
| 244 | if char == ' ': |
| 245 | if not prev_was_space: |
| 246 | collapsed_middle.append(' ') |
| 247 | prev_was_space = True |
| 248 | else: |
| 249 | collapsed_middle.append(char) |
| 250 | prev_was_space = False |
| 251 | |
| 252 | return leading + ''.join(collapsed_middle) + trailing |
| 253 | |
| 254 | |
| 255 | def _find_variation_end(content: str) -> int: |
| 256 | """Find the position of the closing parenthesis that matches the first opening one.""" |
| 257 | depth = 1 |
| 258 | i = 1 |
| 259 | while i < len(content) and depth > 0: |
| 260 | if content[i] == '(': |
| 261 | depth += 1 |
| 262 | elif content[i] == ')': |
| 263 | depth -= 1 |
| 264 | i += 1 |
| 265 | return i |
| 266 | |
| 267 | |
| 268 | def _find_semicolon_at_level(content: str) -> int: |
| 269 | """Find the position of the first semicolon at the base level (not in nested variations), |
| 270 | skipping the initial semicolon that starts the node.""" |
| 271 | depth = 0 |
| 272 | i = 0 |
| 273 | # Skip the initial semicolon that starts the node |
| 274 | if i < len(content) and content[i] == ';': |
| 275 | i += 1 |
| 276 | |
| 277 | while i < len(content): |
| 278 | if content[i] == '(': |
| 279 | depth += 1 |
| 280 | elif content[i] == ')': |
| 281 | depth -= 1 |
| 282 | elif content[i] == ';' and depth == 0: |
| 283 | return i |
| 284 | i += 1 |
| 285 | return -1 |
| 286 | |
| 287 | |
| 288 | def _parse_variation(content: str) -> tuple[SgfTree, str]: |
| 289 | if not content.startswith('('): |
| 290 | raise ValueError("tree missing") |
| 291 | |
| 292 | # Find matching parenthesis - the variation ends at the first ')' |
| 293 | # that brings the depth back to 0 |
| 294 | depth = 1 |
| 295 | i = 1 |
| 296 | while i < len(content) and depth > 0: |
| 297 | if content[i] == '(': |
| 298 | depth += 1 |
| 299 | elif content[i] == ')': |
| 300 | depth -= 1 |
| 301 | i += 1 |
| 302 | |
| 303 | if depth > 0: |
| 304 | raise ValueError("tree missing") |
| 305 | |
| 306 | variation_content = content[1:i-1] # Exclude parentheses |
| 307 | rest = content[i:] |
| 308 | |
| 309 | return _parse_tree(variation_content), rest |
| 310 | |
| 311 | |
| 312 | def _parse_variations(content: str) -> SgfTree: |
| 313 | """Parse a sequence of variations (node list in parentheses).""" |
| 314 | variations = [] |
| 315 | rest = content |
| 316 | while rest.startswith('('): |
| 317 | variation, rest = _parse_variation(rest) |
| 318 | variations.append(variation) |
| 319 | |
| 320 | if not variations: |
| 321 | raise ValueError("tree with no nodes") |
| 322 | |
| 323 | # Return the first variation as the main tree, with others as children |
| 324 | main_tree = variations[0] |
| 325 | # If there are more variations, they become siblings (children of the parent) |
| 326 | # But since we're returning a single tree, we need to handle this differently |
| 327 | # In SGF, (;A[B])(;C[D]) means two separate trees, but we're parsing a single tree |
| 328 | # So we'll make the first one the main tree and others as its children if needed |
| 329 | # But actually, if we're at the top level and have multiple variations with no parent, |
| 330 | # that's a special case that should be handled by the caller |
| 331 | |
| 332 | # For our purposes, we'll return the first variation |
| 333 | return main_tree |
| 334 | |
| 335 | |
| 336 | def _parse_single_node(content: str) -> tuple[SgfTree, str]: |
| 337 | if not content.startswith(';'): |
| 338 | raise ValueError("tree missing") |
| 339 | |
| 340 | node_content, rest = _parse_node_content(content[1:]) # Skip ';' |
| 341 | properties = _parse_properties(node_content) |
| 342 | |
| 343 | children = [] |
| 344 | # Parse any remaining children |
| 345 | while rest: |
| 346 | if rest.startswith('('): |
| 347 | variation, rest = _parse_variation(rest) |
| 348 | children.append(variation) |
| 349 | elif rest.startswith(';'): |
| 350 | child, rest = _parse_single_node(rest) |
| 351 | children.append(child) |
| 352 | else: |
| 353 | break |
| 354 | |
| 355 | return SgfTree(properties=properties, children=children), rest |
| 356 | |
| 357 | # Handled Edge Cases: empty input, missing tree structure, properties without proper delimiters, lowercase property keys, malformed node structures, incorrect bracket matching, escape sequence handling, multiple variations with complex nesting, properties with multiple values, whitespace and newline handling in text values |