| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string containing SGF formatted data |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: A tree structure representing the SGF data |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is malformed or doesn't conform to SGF format |
| 40 | """ |
| 41 | # Edge Case: Empty input or input that doesn't start with '(' |
| 42 | if not input_string or not input_string.startswith('('): |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Input that starts with '(' but doesn't end with ')' |
| 46 | if not input_string.endswith(')'): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Remove the outer parentheses |
| 50 | content = input_string[1:-1] |
| 51 | |
| 52 | # Edge Case: Empty content after removing parentheses |
| 53 | if not content: |
| 54 | raise ValueError("tree with no nodes") |
| 55 | |
| 56 | # Edge Case: Content that doesn't start with ';' |
| 57 | if not content.startswith(';'): |
| 58 | raise ValueError("tree missing") |
| 59 | |
| 60 | return _parse_tree(content) |
| 61 | |
| 62 | |
| 63 | def _parse_tree(content: str) -> SgfTree: |
| 64 | """ |
| 65 | Parse a tree from SGF content. |
| 66 | |
| 67 | Args: |
| 68 | content: SGF content string starting with ';' |
| 69 | |
| 70 | Returns: |
| 71 | SgfTree: Parsed tree structure |
| 72 | """ |
| 73 | # Skip the initial ';' |
| 74 | content = content[1:] |
| 75 | |
| 76 | # Parse properties |
| 77 | properties, remaining = _parse_properties(content) |
| 78 | |
| 79 | children = [] |
| 80 | |
| 81 | # Parse children |
| 82 | while remaining: |
| 83 | if remaining.startswith('('): |
| 84 | # Parse a child tree |
| 85 | child_tree, remaining = _parse_subtree(remaining) |
| 86 | children.append(child_tree) |
| 87 | else: |
| 88 | # No more children |
| 89 | break |
| 90 | |
| 91 | return SgfTree(properties=properties, children=children) |
| 92 | |
| 93 | |
| 94 | def _parse_subtree(content: str) -> tuple[SgfTree, str]: |
| 95 | """ |
| 96 | Parse a subtree from content starting with '('. |
| 97 | |
| 98 | Args: |
| 99 | content: Content starting with '(' |
| 100 | |
| 101 | Returns: |
| 102 | Tuple of (parsed tree, remaining content) |
| 103 | """ |
| 104 | # Find matching closing parenthesis |
| 105 | depth = 0 |
| 106 | end_index = -1 |
| 107 | |
| 108 | for i, char in enumerate(content): |
| 109 | if char == '(': |
| 110 | depth += 1 |
| 111 | elif char == ')': |
| 112 | depth -= 1 |
| 113 | if depth == 0: |
| 114 | end_index = i |
| 115 | break |
| 116 | |
| 117 | # Edge Case: Unmatched parenthesis |
| 118 | if end_index == -1: |
| 119 | raise ValueError("tree missing") |
| 120 | |
| 121 | # Extract subtree content |
| 122 | subtree_content = content[1:end_index] |
| 123 | |
| 124 | # Edge Case: Empty subtree |
| 125 | if not subtree_content: |
| 126 | raise ValueError("tree with no nodes") |
| 127 | |
| 128 | # Edge Case: Subtree doesn't start with ';' |
| 129 | if not subtree_content.startswith(';'): |
| 130 | raise ValueError("tree missing") |
| 131 | |
| 132 | tree = _parse_tree(subtree_content) |
| 133 | |
| 134 | return tree, content[end_index + 1:] |
| 135 | |
| 136 | |
| 137 | def _parse_properties(content: str) -> tuple[dict, str]: |
| 138 | """ |
| 139 | Parse properties from content. |
| 140 | |
| 141 | Args: |
| 142 | content: Content to parse properties from |
| 143 | |
| 144 | Returns: |
| 145 | Tuple of (properties dictionary, remaining content) |
| 146 | """ |
| 147 | properties = {} |
| 148 | remaining = content |
| 149 | |
| 150 | while remaining and remaining[0].isalpha() and remaining[0].isupper(): |
| 151 | # Parse key |
| 152 | key_end = 0 |
| 153 | while key_end < len(remaining) and remaining[key_end].isalpha() and remaining[key_end].isupper(): |
| 154 | key_end += 1 |
| 155 | |
| 156 | # Edge Case: Key with no values |
| 157 | if key_end >= len(remaining) or remaining[key_end] != '[': |
| 158 | raise ValueError("properties without delimiter") |
| 159 | |
| 160 | key = remaining[:key_end] |
| 161 | remaining = remaining[key_end:] |
| 162 | |
| 163 | # Parse values |
| 164 | values = [] |
| 165 | while remaining.startswith('['): |
| 166 | # Find matching closing bracket |
| 167 | value_end = -1 |
| 168 | escape = False |
| 169 | |
| 170 | for i, char in enumerate(remaining[1:], 1): # Start from index 1 to skip the opening '[' |
| 171 | if escape: |
| 172 | escape = False |
| 173 | continue |
| 174 | if char == '\\': |
| 175 | escape = True |
| 176 | elif char == ']': |
| 177 | value_end = i |
| 178 | break |
| 179 | |
| 180 | # Edge Case: Unmatched bracket |
| 181 | if value_end == -1: |
| 182 | raise ValueError("properties without delimiter") |
| 183 | |
| 184 | # Extract and process value |
| 185 | value = remaining[1:value_end] |
| 186 | processed_value = _process_text_value(value) |
| 187 | values.append(processed_value) |
| 188 | |
| 189 | # Move past this value |
| 190 | remaining = remaining[value_end + 1:] |
| 191 | |
| 192 | # Edge Case: No values found for key |
| 193 | if not values: |
| 194 | raise ValueError("properties without delimiter") |
| 195 | |
| 196 | properties[key] = values |
| 197 | |
| 198 | return properties, remaining |
| 199 | |
| 200 | |
| 201 | def _process_text_value(value: str) -> str: |
| 202 | """ |
| 203 | Process an SGF text value according to SGF specification. |
| 204 | |
| 205 | Args: |
| 206 | value: Raw text value from SGF |
| 207 | |
| 208 | Returns: |
| 209 | Processed text value |
| 210 | """ |
| 211 | result = [] |
| 212 | i = 0 |
| 213 | |
| 214 | while i < len(value): |
| 215 | char = value[i] |
| 216 | |
| 217 | if char == '\\': |
| 218 | # Handle escape sequence |
| 219 | i += 1 |
| 220 | if i < len(value): |
| 221 | next_char = value[i] |
| 222 | if next_char == 'n': |
| 223 | # Special case: \n should be treated as literal 'n' if not followed by newline |
| 224 | result.append('n') |
| 225 | elif next_char == 't': |
| 226 | # Special case: \t should be treated as literal 't' |
| 227 | result.append('t') |
| 228 | elif next_char == '\\': |
| 229 | # Escaped backslash |
| 230 | result.append('\\') |
| 231 | elif next_char == ']': |
| 232 | # Escaped closing bracket |
| 233 | result.append(']') |
| 234 | elif next_char == 'r': |
| 235 | # Special case: \r should be treated as literal 'r' |
| 236 | result.append('r') |
| 237 | elif next_char.isspace(): |
| 238 | # Whitespace after backslash |
| 239 | if next_char == '\n': |
| 240 | # If newline immediately follows backslash, remove it |
| 241 | pass |
| 242 | else: |
| 243 | # Other whitespace becomes space |
| 244 | result.append(' ') |
| 245 | else: |
| 246 | # Any other character after backslash is inserted as-is |
| 247 | result.append(next_char) |
| 248 | else: |
| 249 | # Backslash at end of string |
| 250 | result.append('\\') |
| 251 | elif char == '\n' and i > 0 and value[i-1] == '\\': |
| 252 | # Newline immediately after backslash should be removed |
| 253 | # This case is handled above |
| 254 | result.append(char) |
| 255 | elif char.isspace() and char != '\n': |
| 256 | # All whitespace other than newline becomes space |
| 257 | result.append(' ') |
| 258 | else: |
| 259 | result.append(char) |
| 260 | |
| 261 | i += 1 |
| 262 | |
| 263 | # Join and handle newlines that come immediately after backslash |
| 264 | processed = ''.join(result) |
| 265 | |
| 266 | # Remove newlines that come immediately after backslash |
| 267 | final_result = [] |
| 268 | i = 0 |
| 269 | while i < len(processed): |
| 270 | if processed[i] == '\\' and i + 1 < len(processed) and processed[i + 1] == '\n': |
| 271 | # Skip the newline |
| 272 | i += 2 |
| 273 | else: |
| 274 | final_result.append(processed[i]) |
| 275 | i += 1 |
| 276 | |
| 277 | return ''.join(final_result) |
| 278 | |
| 279 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty content after parentheses, missing semicolon, unmatched parenthesis, empty subtree, subtree not starting with semicolon, key with no values, unmatched bracket, no values for key |
| 280 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis |
| 281 | # Handled Edge Cases: Empty content after removing parentheses, content not starting with semicolon |
| 282 | # Handled Edge Cases: Unmatched parenthesis, empty subtree, subtree not starting with semicolon |
| 283 | # Handled Edge Cases: Key with no values, unmatched bracket, no values for key |