| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input |
| 42 | if not input_string: |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Missing opening parenthesis |
| 46 | if not input_string.startswith('('): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Edge Case: Missing closing parenthesis |
| 50 | if not input_string.endswith(')'): |
| 51 | raise ValueError("tree missing") |
| 52 | |
| 53 | # Remove the outer parentheses |
| 54 | content = input_string[1:-1] |
| 55 | |
| 56 | # Edge Case: Empty tree with no nodes |
| 57 | if not content: |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | # Edge Case: Missing semicolon after opening parenthesis |
| 61 | if not content.startswith(';'): |
| 62 | raise ValueError("tree missing") |
| 63 | |
| 64 | return _parse_tree(content) |
| 65 | |
| 66 | |
| 67 | def _parse_tree(content: str) -> SgfTree: |
| 68 | """ |
| 69 | Parse the content of a tree (without outer parentheses). |
| 70 | |
| 71 | Args: |
| 72 | content: The content of the tree |
| 73 | |
| 74 | Returns: |
| 75 | SgfTree: The parsed tree |
| 76 | """ |
| 77 | # Skip the initial semicolon |
| 78 | index = 1 |
| 79 | |
| 80 | # Parse properties of the current node |
| 81 | properties, index = _parse_properties(content, index) |
| 82 | |
| 83 | # Parse children |
| 84 | children = [] |
| 85 | while index < len(content): |
| 86 | if content[index] == '(': # Start of a child tree (variation) |
| 87 | # Find the matching closing parenthesis |
| 88 | paren_count = 1 |
| 89 | start = index |
| 90 | index += 1 |
| 91 | while index < len(content) and paren_count > 0: |
| 92 | if content[index] == '(': |
| 93 | paren_count += 1 |
| 94 | elif content[index] == ')': |
| 95 | paren_count -= 1 |
| 96 | index += 1 |
| 97 | |
| 98 | # Edge Case: Malformed parentheses |
| 99 | if paren_count != 0: |
| 100 | raise ValueError("tree missing") |
| 101 | |
| 102 | # Parse the child tree |
| 103 | child_content = content[start:index] |
| 104 | children.append(parse(child_content)) |
| 105 | elif content[index] == ';': # Start of a sequential child node |
| 106 | # Parse the sequential node and its children |
| 107 | child_node, index = _parse_sequential_node_from_position(content, index) |
| 108 | children.append(child_node) |
| 109 | else: |
| 110 | # Edge Case: Unexpected character |
| 111 | raise ValueError("tree missing") |
| 112 | |
| 113 | return SgfTree(properties=properties, children=children) |
| 114 | |
| 115 | |
| 116 | def _parse_properties(content: str, start_index: int) -> tuple[dict, int]: |
| 117 | """ |
| 118 | Parse properties from the content starting at start_index. |
| 119 | |
| 120 | Args: |
| 121 | content: The content to parse |
| 122 | start_index: The index to start parsing from |
| 123 | |
| 124 | Returns: |
| 125 | tuple: A tuple of (properties_dict, next_index) |
| 126 | """ |
| 127 | properties = {} |
| 128 | index = start_index |
| 129 | |
| 130 | while index < len(content) and content[index] not in ['(', ')', ';']: |
| 131 | # Expecting a property in format Key[Value] |
| 132 | # Parse the key |
| 133 | key_start = index |
| 134 | |
| 135 | # Edge Case: Key must be uppercase |
| 136 | if not content[index].isalpha() or not content[index].isupper(): |
| 137 | raise ValueError("property must be in uppercase") |
| 138 | |
| 139 | while index < len(content) and content[index].isalpha() and content[index].isupper(): |
| 140 | index += 1 |
| 141 | |
| 142 | # Edge Case: Empty key |
| 143 | if index == key_start: |
| 144 | raise ValueError("properties without delimiter") |
| 145 | |
| 146 | key = content[key_start:index] |
| 147 | |
| 148 | # Edge Case: Missing opening bracket for value |
| 149 | if index >= len(content) or content[index] != '[': |
| 150 | raise ValueError("properties without delimiter") |
| 151 | |
| 152 | # Parse all values for this key |
| 153 | values = [] |
| 154 | while index < len(content) and content[index] == '[': |
| 155 | index += 1 # Skip the opening bracket |
| 156 | value_start = index |
| 157 | |
| 158 | # Parse the value until closing bracket |
| 159 | while index < len(content) and content[index] != ']': |
| 160 | index += 1 |
| 161 | |
| 162 | # Edge Case: Missing closing bracket |
| 163 | if index >= len(content) or content[index] != ']': |
| 164 | raise ValueError("properties without delimiter") |
| 165 | |
| 166 | value = content[value_start:index] |
| 167 | # Process the value according to SGF text type rules |
| 168 | value = _process_text_value(value) |
| 169 | values.append(value) |
| 170 | index += 1 # Skip the closing bracket |
| 171 | |
| 172 | # Add the property |
| 173 | properties[key] = values |
| 174 | |
| 175 | return properties, index |
| 176 | |
| 177 | |
| 178 | def _parse_sequential_node_from_position(content: str, start_index: int) -> tuple[SgfTree, int]: |
| 179 | """ |
| 180 | Parse a sequential node starting from a specific position. |
| 181 | |
| 182 | Args: |
| 183 | content: The content to parse |
| 184 | start_index: The index where the semicolon is located |
| 185 | |
| 186 | Returns: |
| 187 | tuple: (SgfTree node, new index position) |
| 188 | """ |
| 189 | # Edge Case: Must start with semicolon |
| 190 | if content[start_index] != ';': |
| 191 | raise ValueError("tree missing") |
| 192 | |
| 193 | # Skip the initial semicolon |
| 194 | index = start_index + 1 |
| 195 | |
| 196 | # Parse properties of the current node |
| 197 | properties, index = _parse_properties(content, index) |
| 198 | |
| 199 | # Parse children (if any) |
| 200 | children = [] |
| 201 | while index < len(content) and content[index] not in [')', '(']: |
| 202 | if content[index] == ';': # Start of a sequential child node |
| 203 | # Parse the next sequential node |
| 204 | child_node, index = _parse_sequential_node_from_position(content, index) |
| 205 | children.append(child_node) |
| 206 | else: |
| 207 | # Edge Case: Unexpected character |
| 208 | raise ValueError("tree missing") |
| 209 | |
| 210 | # Handle variations (parenthesized children) |
| 211 | while index < len(content) and content[index] == '(': # Start of a child tree (variation) |
| 212 | # Find the matching closing parenthesis |
| 213 | paren_count = 1 |
| 214 | start = index |
| 215 | index += 1 |
| 216 | while index < len(content) and paren_count > 0: |
| 217 | if content[index] == '(': |
| 218 | paren_count += 1 |
| 219 | elif content[index] == ')': |
| 220 | paren_count -= 1 |
| 221 | index += 1 |
| 222 | |
| 223 | # Edge Case: Malformed parentheses |
| 224 | if paren_count != 0: |
| 225 | raise ValueError("tree missing") |
| 226 | |
| 227 | # Parse the child tree |
| 228 | child_content = content[start:index] |
| 229 | children.append(parse(child_content)) |
| 230 | |
| 231 | return SgfTree(properties=properties, children=children), index |
| 232 | |
| 233 | |
| 234 | def _process_text_value(value: str) -> str: |
| 235 | """ |
| 236 | Process a text value according to SGF text type rules. |
| 237 | |
| 238 | Args: |
| 239 | value: The raw value string |
| 240 | |
| 241 | Returns: |
| 242 | str: The processed value |
| 243 | """ |
| 244 | result = [] |
| 245 | i = 0 |
| 246 | while i < len(value): |
| 247 | if value[i] == '\\': |
| 248 | # Handle escape character |
| 249 | i += 1 |
| 250 | if i < len(value): |
| 251 | if value[i] == '\n': |
| 252 | # Newline immediately after \ is removed |
| 253 | pass # Just skip both the \ and the \n |
| 254 | else: |
| 255 | # Any other character after \ is inserted as-is |
| 256 | # This includes whitespace characters which follow their normal rules |
| 257 | if value[i] in ' \t\r': |
| 258 | # All whitespace other than newline converted to space |
| 259 | result.append(' ') |
| 260 | else: |
| 261 | # Non-whitespace character inserted as-is |
| 262 | result.append(value[i]) |
| 263 | else: |
| 264 | # Backslash at end of string |
| 265 | result.append('\\') |
| 266 | else: |
| 267 | # Handle regular characters |
| 268 | char = value[i] |
| 269 | if char == '\n': |
| 270 | # Newlines remain as newlines |
| 271 | result.append(char) |
| 272 | elif char in ' \t\r': |
| 273 | # All whitespace other than newline converted to space |
| 274 | result.append(' ') |
| 275 | else: |
| 276 | # All other characters inserted as-is |
| 277 | result.append(char) |
| 278 | i += 1 |
| 279 | |
| 280 | return ''.join(result) |
| 281 | |
| 282 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty tree with no nodes, missing semicolon after opening parenthesis, malformed parentheses, unexpected character, key must be uppercase, empty key, missing opening bracket for value, missing closing bracket, escape at end of string |