| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input |
| 42 | if not input_string: |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Missing opening parenthesis |
| 46 | if not input_string.startswith('('): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Edge Case: Missing closing parenthesis |
| 50 | if not input_string.endswith(')'): |
| 51 | raise ValueError("tree missing") |
| 52 | |
| 53 | # Remove the outer parentheses |
| 54 | content = input_string[1:-1] |
| 55 | |
| 56 | # Edge Case: Empty tree with no nodes |
| 57 | if not content: |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | # Edge Case: Missing semicolon after opening parenthesis |
| 61 | if not content.startswith(';'): |
| 62 | raise ValueError("tree missing") |
| 63 | |
| 64 | return _parse_tree(content) |
| 65 | |
| 66 | |
| 67 | def _parse_tree(content: str) -> SgfTree: |
| 68 | """ |
| 69 | Parse the content of a tree (without the outer parentheses). |
| 70 | |
| 71 | Args: |
| 72 | content: The content of the tree |
| 73 | |
| 74 | Returns: |
| 75 | SgfTree: The parsed tree |
| 76 | """ |
| 77 | # Skip the initial semicolon |
| 78 | index = 1 |
| 79 | |
| 80 | # Parse properties of the root node |
| 81 | properties, index = _parse_properties(content, index) |
| 82 | |
| 83 | # Parse children |
| 84 | children = [] |
| 85 | while index < len(content): |
| 86 | if content[index] == '(': # Start of a child tree |
| 87 | # Find the matching closing parenthesis |
| 88 | paren_count = 1 |
| 89 | start = index |
| 90 | index += 1 |
| 91 | while index < len(content) and paren_count > 0: |
| 92 | if content[index] == '(': |
| 93 | paren_count += 1 |
| 94 | elif content[index] == ')': |
| 95 | paren_count -= 1 |
| 96 | index += 1 |
| 97 | |
| 98 | # Parse the child tree |
| 99 | child_content = content[start:index] |
| 100 | children.append(parse(child_content)) |
| 101 | elif content[index] == ';': # Single child shorthand |
| 102 | # Parse the child node starting from this semicolon |
| 103 | index += 1 # Skip the semicolon |
| 104 | |
| 105 | # Parse properties of the child node |
| 106 | child_properties, index = _parse_properties(content, index) |
| 107 | |
| 108 | # Create child tree with these properties |
| 109 | # For single child shorthand, we need to continue parsing children of this child |
| 110 | child_tree = SgfTree(child_properties) |
| 111 | |
| 112 | # Parse any further children of this child node |
| 113 | temp_children = [] |
| 114 | while index < len(content) and content[index] in '();': |
| 115 | if content[index] == '(': # Variation |
| 116 | # Find the matching closing parenthesis |
| 117 | paren_count = 1 |
| 118 | start = index |
| 119 | index += 1 |
| 120 | while index < len(content) and paren_count > 0: |
| 121 | if content[index] == '(': |
| 122 | paren_count += 1 |
| 123 | elif content[index] == ')': |
| 124 | paren_count -= 1 |
| 125 | index += 1 |
| 126 | |
| 127 | # Parse the child tree |
| 128 | child_content = content[start:index] |
| 129 | temp_children.append(parse(child_content)) |
| 130 | elif content[index] == ';': # Another single child |
| 131 | # For single child shorthand, create a chain |
| 132 | index += 1 # Skip the semicolon |
| 133 | next_properties, index = _parse_properties(content, index) |
| 134 | next_child = SgfTree(next_properties) |
| 135 | temp_children.append(next_child) |
| 136 | else: |
| 137 | index += 1 |
| 138 | |
| 139 | if temp_children: |
| 140 | child_tree.children = temp_children |
| 141 | |
| 142 | children.append(child_tree) |
| 143 | else: |
| 144 | index += 1 |
| 145 | |
| 146 | return SgfTree(properties, children) |
| 147 | |
| 148 | |
| 149 | def _parse_properties(content: str, start_index: int) -> tuple[dict, int]: |
| 150 | """ |
| 151 | Parse properties from the content starting at start_index. |
| 152 | |
| 153 | Args: |
| 154 | content: The content to parse |
| 155 | start_index: The index to start parsing from |
| 156 | |
| 157 | Returns: |
| 158 | tuple: A tuple containing the parsed properties dictionary and the next index |
| 159 | """ |
| 160 | properties = {} |
| 161 | index = start_index |
| 162 | |
| 163 | while index < len(content) and content[index] not in '();': |
| 164 | # Parse key |
| 165 | key_start = index |
| 166 | while index < len(content) and content[index].isalpha(): |
| 167 | index += 1 |
| 168 | |
| 169 | key = content[key_start:index] |
| 170 | |
| 171 | # Edge Case: Property key must be in uppercase |
| 172 | if not key.isupper(): |
| 173 | raise ValueError("property must be in uppercase") |
| 174 | |
| 175 | # Edge Case: Missing property values |
| 176 | if index >= len(content) or content[index] != '[': |
| 177 | raise ValueError("properties without delimiter") |
| 178 | |
| 179 | # Parse values |
| 180 | values = [] |
| 181 | while index < len(content) and content[index] == '[': |
| 182 | index += 1 # Skip opening bracket |
| 183 | value_start = index |
| 184 | |
| 185 | # Find the closing bracket, taking into account escaped brackets |
| 186 | bracket_count = 1 |
| 187 | while index < len(content) and bracket_count > 0: |
| 188 | if content[index] == '\\': |
| 189 | # Skip escape character and the next character |
| 190 | # The escaped character should not affect bracket counting |
| 191 | index += 2 |
| 192 | elif content[index] == '[': |
| 193 | bracket_count += 1 |
| 194 | index += 1 |
| 195 | elif content[index] == ']': |
| 196 | bracket_count -= 1 |
| 197 | index += 1 |
| 198 | else: |
| 199 | index += 1 |
| 200 | |
| 201 | # Edge Case: Unmatched bracket |
| 202 | if bracket_count > 0: |
| 203 | raise ValueError("properties without delimiter") |
| 204 | |
| 205 | value = content[value_start:index-1] # Exclude the closing bracket |
| 206 | values.append(_parse_text(value)) |
| 207 | |
| 208 | properties[key] = values |
| 209 | |
| 210 | return properties, index |
| 211 | |
| 212 | |
| 213 | def _parse_text(text: str) -> str: |
| 214 | """ |
| 215 | Parse SGF Text type according to SGF specification. |
| 216 | |
| 217 | Args: |
| 218 | text: The text to parse |
| 219 | |
| 220 | Returns: |
| 221 | str: The parsed text |
| 222 | """ |
| 223 | result = [] |
| 224 | i = 0 |
| 225 | while i < len(text): |
| 226 | if text[i] == '\\': |
| 227 | # Edge Case: Handle escape sequences |
| 228 | i += 1 |
| 229 | if i < len(text): |
| 230 | char = text[i] |
| 231 | if char == 'n': |
| 232 | # \n is not a special escape sequence in SGF, it's just 'n' |
| 233 | result.append('n') |
| 234 | elif char == 't': |
| 235 | # \t is not a special escape sequence in SGF, it's just 't' |
| 236 | result.append('t') |
| 237 | elif char == '\\': |
| 238 | # Escaped backslash |
| 239 | result.append('\\') |
| 240 | elif char == ']': |
| 241 | # Escaped closing bracket |
| 242 | result.append(']') |
| 243 | elif char == '[': |
| 244 | # Escaped opening bracket |
| 245 | result.append('[') |
| 246 | elif char == ':': |
| 247 | # Escaped colon |
| 248 | result.append(':') |
| 249 | elif char.isspace(): |
| 250 | # Whitespace after backslash |
| 251 | if char == '\n': |
| 252 | # Newlines are removed if they come immediately after a \ |
| 253 | pass |
| 254 | else: |
| 255 | # Other whitespace characters are converted to spaces |
| 256 | result.append(' ') |
| 257 | else: |
| 258 | # Any other character after \ is inserted as-is |
| 259 | result.append(char) |
| 260 | i += 1 |
| 261 | else: |
| 262 | # Edge Case: Trailing backslash |
| 263 | result.append('\\') |
| 264 | i += 1 |
| 265 | else: |
| 266 | char = text[i] |
| 267 | if char == '\n': |
| 268 | # Newlines remain as newlines |
| 269 | result.append(char) |
| 270 | elif char.isspace() and char != '\n': |
| 271 | # All whitespace characters other than newline are converted to spaces |
| 272 | result.append(' ') |
| 273 | else: |
| 274 | result.append(char) |
| 275 | i += 1 |
| 276 | |
| 277 | return ''.join(result) |
| 278 | # Handled Edge Cases: Empty input, Missing opening parenthesis, Missing closing parenthesis, Empty tree with no nodes, Missing semicolon after opening parenthesis, Property key must be in uppercase, Missing property values, Unmatched bracket, Handle escape sequences, Trailing backslash |
| 279 | # Handled Edge Cases: Empty input, Missing opening parenthesis, Missing closing parenthesis, Empty tree with no nodes, Missing semicolon after opening parenthesis |
| 280 | # Handled Edge Cases: Property key must be in uppercase, Missing property values, Unmatched bracket |
| 281 | # Handled Edge Cases: Handle escape sequences, Trailing backslash |