| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input string is not valid SGF |
| 40 | """ |
| 41 | # Edge Case: Empty input |
| 42 | if not input_string: |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Missing opening parenthesis |
| 46 | if not input_string.startswith('('): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Edge Case: Missing closing parenthesis |
| 50 | if not input_string.endswith(')'): |
| 51 | raise ValueError("tree missing") |
| 52 | |
| 53 | # Remove the outer parentheses |
| 54 | content = input_string[1:-1] |
| 55 | |
| 56 | # Edge Case: Empty tree |
| 57 | if not content: |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | # Edge Case: Missing semicolon after opening parenthesis |
| 61 | if not content.startswith(';'): |
| 62 | raise ValueError("tree missing") |
| 63 | |
| 64 | return _parse_tree(content) |
| 65 | |
| 66 | |
| 67 | def _parse_tree(content: str) -> SgfTree: |
| 68 | """ |
| 69 | Parse a tree from SGF content. |
| 70 | |
| 71 | Args: |
| 72 | content: SGF content without outer parentheses |
| 73 | |
| 74 | Returns: |
| 75 | SgfTree: The parsed SGF tree |
| 76 | """ |
| 77 | # Skip the initial semicolon |
| 78 | index = 1 |
| 79 | |
| 80 | # Parse properties of the current node |
| 81 | properties, index = _parse_properties(content, index) |
| 82 | |
| 83 | children = [] |
| 84 | |
| 85 | # Parse children |
| 86 | while index < len(content): |
| 87 | if content[index] == '(': # Start of a child tree |
| 88 | child_tree, consumed = _parse_subtree(content[index:]) |
| 89 | children.append(child_tree) |
| 90 | index += consumed |
| 91 | elif content[index] == ';': # Another node in sequence |
| 92 | # This is a shorthand - the next node is a child of the current node |
| 93 | child_tree, consumed = _parse_subtree('(' + content[index:] + ')') |
| 94 | children.append(child_tree) |
| 95 | break # The rest is handled by the child |
| 96 | else: |
| 97 | index += 1 |
| 98 | |
| 99 | return SgfTree(properties, children) |
| 100 | |
| 101 | |
| 102 | def _parse_subtree(content: str) -> tuple[SgfTree, int]: |
| 103 | """ |
| 104 | Parse a subtree from SGF content. |
| 105 | |
| 106 | Args: |
| 107 | content: SGF content starting with '(' |
| 108 | |
| 109 | Returns: |
| 110 | tuple[SgfTree, int]: The parsed subtree and number of characters consumed |
| 111 | """ |
| 112 | # Edge Case: Missing opening parenthesis |
| 113 | if not content.startswith('('): |
| 114 | raise ValueError("tree missing") |
| 115 | |
| 116 | # Find matching closing parenthesis |
| 117 | depth = 1 |
| 118 | index = 1 |
| 119 | |
| 120 | while index < len(content) and depth > 0: |
| 121 | if content[index] == '(': |
| 122 | depth += 1 |
| 123 | elif content[index] == ')': |
| 124 | depth -= 1 |
| 125 | index += 1 |
| 126 | |
| 127 | # Edge Case: Unmatched parenthesis |
| 128 | if depth > 0: |
| 129 | raise ValueError("tree missing") |
| 130 | |
| 131 | # Parse the content inside the parentheses |
| 132 | inner_content = content[1:index-1] |
| 133 | |
| 134 | # Edge Case: Empty subtree |
| 135 | if not inner_content: |
| 136 | raise ValueError("tree with no nodes") |
| 137 | |
| 138 | # Edge Case: Missing semicolon |
| 139 | if not inner_content.startswith(';'): |
| 140 | raise ValueError("tree missing") |
| 141 | |
| 142 | tree = _parse_tree(inner_content) |
| 143 | |
| 144 | return tree, index |
| 145 | |
| 146 | |
| 147 | def _parse_properties(content: str, start_index: int) -> tuple[dict, int]: |
| 148 | """ |
| 149 | Parse properties from SGF content starting at the given index. |
| 150 | |
| 151 | Args: |
| 152 | content: SGF content |
| 153 | start_index: Index to start parsing from |
| 154 | |
| 155 | Returns: |
| 156 | tuple[dict, int]: The parsed properties and the next index |
| 157 | """ |
| 158 | properties = {} |
| 159 | index = start_index |
| 160 | |
| 161 | while index < len(content) and content[index] not in '();': |
| 162 | # Parse property key |
| 163 | key_start = index |
| 164 | |
| 165 | # Edge Case: Key must be uppercase |
| 166 | while index < len(content) and content[index].isalpha(): |
| 167 | if not content[index].isupper(): |
| 168 | raise ValueError("property must be in uppercase") |
| 169 | index += 1 |
| 170 | |
| 171 | # Edge Case: Empty key |
| 172 | if index == key_start: |
| 173 | raise ValueError("properties without delimiter") |
| 174 | |
| 175 | key = content[key_start:index] |
| 176 | |
| 177 | # Edge Case: Missing opening bracket |
| 178 | if index >= len(content) or content[index] != '[': |
| 179 | raise ValueError("properties without delimiter") |
| 180 | |
| 181 | values = [] |
| 182 | |
| 183 | # Parse all values for this key |
| 184 | while index < len(content) and content[index] == '[': |
| 185 | index += 1 # Skip opening bracket |
| 186 | value_start = index |
| 187 | |
| 188 | # Parse value |
| 189 | while index < len(content): |
| 190 | if content[index] == ']': |
| 191 | # Check if this ] is escaped |
| 192 | backslash_count = 0 |
| 193 | check_index = index - 1 |
| 194 | while check_index >= value_start and content[check_index] == '\\': |
| 195 | backslash_count += 1 |
| 196 | check_index -= 1 |
| 197 | # If odd number of backslashes, this ] is escaped |
| 198 | if backslash_count % 2 == 1: |
| 199 | index += 1 |
| 200 | continue |
| 201 | else: |
| 202 | break |
| 203 | else: |
| 204 | index += 1 |
| 205 | |
| 206 | # Edge Case: Missing closing bracket |
| 207 | if index >= len(content) or content[index] != ']': |
| 208 | raise ValueError("properties without delimiter") |
| 209 | |
| 210 | value = _unescape_text(content[value_start:index]) |
| 211 | values.append(value) |
| 212 | index += 1 # Skip closing bracket |
| 213 | |
| 214 | properties[key] = values |
| 215 | |
| 216 | return properties, index |
| 217 | |
| 218 | |
| 219 | def _unescape_text(text: str) -> str: |
| 220 | """ |
| 221 | Unescape SGF text according to SGF specification. |
| 222 | |
| 223 | Args: |
| 224 | text: Escaped SGF text |
| 225 | |
| 226 | Returns: |
| 227 | str: Unescaped text |
| 228 | """ |
| 229 | result = [] |
| 230 | index = 0 |
| 231 | |
| 232 | while index < len(text): |
| 233 | if text[index] == '\\': |
| 234 | index += 1 |
| 235 | if index < len(text): |
| 236 | char = text[index] |
| 237 | # Newlines are removed if they come immediately after a \ |
| 238 | if char == '\n': |
| 239 | pass # Remove the newline |
| 240 | # All whitespace characters other than newline are converted to spaces |
| 241 | elif char in ' \t\r': |
| 242 | result.append(' ') |
| 243 | # Any non-whitespace character after \ is inserted as-is |
| 244 | else: |
| 245 | result.append(char) |
| 246 | else: |
| 247 | char = text[index] |
| 248 | # All whitespace characters other than newline are converted to spaces |
| 249 | if char in ' \t\r': |
| 250 | result.append(' ') |
| 251 | else: |
| 252 | result.append(char) |
| 253 | index += 1 |
| 254 | |
| 255 | return ''.join(result) |
| 256 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty tree, missing semicolon after opening parenthesis, unmatched parenthesis, empty subtree, missing semicolon, key must be uppercase, empty key, missing opening bracket, missing closing bracket |
| 257 | # Handled Edge Cases: Empty key, missing opening bracket, missing closing bracket |
| 258 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty tree, missing semicolon after opening parenthesis, unmatched parenthesis, empty subtree, missing semicolon, key must be uppercase |
| 259 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty tree, missing semicolon after opening parenthesis |