| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input |
| 42 | if not input_string: |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Missing opening parenthesis |
| 46 | if not input_string.startswith('('): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Edge Case: Missing closing parenthesis |
| 50 | if not input_string.endswith(')'): |
| 51 | raise ValueError("tree missing") |
| 52 | |
| 53 | # Remove the outer parentheses |
| 54 | content = input_string[1:-1] |
| 55 | |
| 56 | # Edge Case: Empty tree with no nodes |
| 57 | if not content: |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | # Edge Case: Missing semicolon after opening parenthesis |
| 61 | if not content.startswith(';'): |
| 62 | raise ValueError("tree missing") |
| 63 | |
| 64 | return _parse_tree(content) |
| 65 | |
| 66 | |
| 67 | def _parse_tree(content: str) -> SgfTree: |
| 68 | """ |
| 69 | Parse the content of a tree (without outer parentheses). |
| 70 | |
| 71 | Args: |
| 72 | content: The content of the tree |
| 73 | |
| 74 | Returns: |
| 75 | SgfTree: The parsed tree |
| 76 | """ |
| 77 | # Skip the initial semicolon |
| 78 | if not content.startswith(';'): |
| 79 | raise ValueError("tree missing") |
| 80 | content = content[1:] |
| 81 | |
| 82 | # Parse properties of the current node |
| 83 | properties, remaining = _parse_properties(content) |
| 84 | |
| 85 | children = [] |
| 86 | |
| 87 | # Parse children |
| 88 | while remaining: |
| 89 | if remaining.startswith('('): |
| 90 | # Parse a subtree |
| 91 | subtree_content, remaining = _parse_subtree(remaining) |
| 92 | child = _parse_tree(subtree_content) |
| 93 | children.append(child) |
| 94 | elif remaining.startswith(';'): |
| 95 | # Parse a direct child node (shorthand notation) |
| 96 | child_content = ';' |
| 97 | remaining = remaining[1:] |
| 98 | props, remaining_props = _parse_properties(remaining) |
| 99 | # The content for this child is everything from the start of remaining up to remaining_props |
| 100 | consumed_length = len(remaining) - len(remaining_props) |
| 101 | child_content += remaining[:consumed_length] |
| 102 | remaining = remaining_props |
| 103 | child = _parse_tree(child_content) |
| 104 | children.append(child) |
| 105 | else: |
| 106 | # No more children |
| 107 | break |
| 108 | |
| 109 | return SgfTree(properties, children) |
| 110 | |
| 111 | |
| 112 | def _parse_properties(content: str) -> tuple[dict, str]: |
| 113 | """ |
| 114 | Parse properties from the beginning of the content. |
| 115 | |
| 116 | Args: |
| 117 | content: String content to parse properties from |
| 118 | |
| 119 | Returns: |
| 120 | tuple: (properties dict, remaining content) |
| 121 | """ |
| 122 | properties = {} |
| 123 | remaining = content |
| 124 | |
| 125 | while remaining and remaining[0].isalpha(): |
| 126 | # Check if property is uppercase |
| 127 | if not remaining[0].isupper(): |
| 128 | raise ValueError("property must be in uppercase") |
| 129 | |
| 130 | # Parse key |
| 131 | key_end = 0 |
| 132 | while key_end < len(remaining) and remaining[key_end].isalpha() and remaining[key_end].isupper(): |
| 133 | key_end += 1 |
| 134 | |
| 135 | # Edge Case: Key without values |
| 136 | if key_end >= len(remaining) or remaining[key_end] != '[': |
| 137 | raise ValueError("properties without delimiter") |
| 138 | |
| 139 | key = remaining[:key_end] |
| 140 | remaining = remaining[key_end:] |
| 141 | |
| 142 | # Parse values |
| 143 | values = [] |
| 144 | while remaining.startswith('['): |
| 145 | remaining = remaining[1:] # Skip opening bracket |
| 146 | value, remaining = _parse_value(remaining) |
| 147 | # Edge Case: Missing closing bracket |
| 148 | if not remaining.startswith(']'): |
| 149 | raise ValueError("properties without delimiter") |
| 150 | remaining = remaining[1:] # Skip closing bracket |
| 151 | values.append(value) |
| 152 | |
| 153 | # Edge Case: No values for property |
| 154 | if not values: |
| 155 | raise ValueError("properties without delimiter") |
| 156 | |
| 157 | properties[key] = values |
| 158 | |
| 159 | return properties, remaining |
| 160 | |
| 161 | |
| 162 | def _parse_value(content: str) -> tuple[str, str]: |
| 163 | """ |
| 164 | Parse a value from SGF Text type. |
| 165 | |
| 166 | Args: |
| 167 | content: String content to parse value from |
| 168 | |
| 169 | Returns: |
| 170 | tuple: (parsed value, remaining content) |
| 171 | """ |
| 172 | value = '' |
| 173 | i = 0 |
| 174 | |
| 175 | while i < len(content) and content[i] != ']': |
| 176 | if content[i] == '\\': |
| 177 | # Handle escape character |
| 178 | i += 1 |
| 179 | if i >= len(content): |
| 180 | # Edge Case: Escape at end of content |
| 181 | value += '\\' |
| 182 | break |
| 183 | |
| 184 | char = content[i] |
| 185 | if char == '\n': |
| 186 | # Newline immediately after backslash is removed |
| 187 | pass |
| 188 | elif char.isspace(): |
| 189 | # Whitespace (including tabs, spaces, etc.) after backslash follows the above rules |
| 190 | # All whitespace other than newline is converted to space |
| 191 | value += ' ' |
| 192 | else: |
| 193 | # Any non-whitespace character after backslash is inserted as-is |
| 194 | # But for closing bracket, the backslash escapes it |
| 195 | if char == ']': |
| 196 | value += ']' |
| 197 | elif char == 'n': |
| 198 | # \n becomes actual newline character |
| 199 | value += '\n' |
| 200 | elif char == 't': |
| 201 | # \t becomes actual tab character |
| 202 | value += '\t' |
| 203 | elif char == 'r': |
| 204 | # \r becomes actual carriage return character |
| 205 | value += '\r' |
| 206 | else: |
| 207 | # For other characters, SGF does not have escape sequences |
| 208 | # These are just literal backslash followed by the character |
| 209 | value += '\\' + char |
| 210 | elif content[i] == '\n': |
| 211 | # Newlines are kept unless immediately after backslash |
| 212 | value += content[i] |
| 213 | elif content[i].isspace(): |
| 214 | # All whitespace other than newline is converted to space |
| 215 | value += ' ' |
| 216 | else: |
| 217 | value += content[i] |
| 218 | i += 1 |
| 219 | |
| 220 | return value, content[i:] |
| 221 | |
| 222 | |
| 223 | def _parse_subtree(content: str) -> tuple[str, str]: |
| 224 | """ |
| 225 | Parse a subtree including its parentheses. |
| 226 | |
| 227 | Args: |
| 228 | content: String content starting with '(' |
| 229 | |
| 230 | Returns: |
| 231 | tuple: (subtree content including parentheses, remaining content) |
| 232 | """ |
| 233 | # Edge Case: Not starting with '(' |
| 234 | if not content.startswith('('): |
| 235 | raise ValueError("tree missing") |
| 236 | |
| 237 | balance = 0 |
| 238 | i = 0 |
| 239 | |
| 240 | for i, char in enumerate(content): |
| 241 | if char == '(': |
| 242 | balance += 1 |
| 243 | elif char == ')': |
| 244 | balance -= 1 |
| 245 | if balance == 0: |
| 246 | # Found the matching closing parenthesis |
| 247 | return content[1:i], content[i+1:] |
| 248 | |
| 249 | # Edge Case: Unmatched parentheses |
| 250 | raise ValueError("tree missing") |
| 251 | # Handled Edge Cases: Empty input, Missing opening parenthesis, Missing closing parenthesis, Empty tree with no nodes, Missing semicolon after opening parenthesis, Key without values, Missing closing bracket, No values for property, Escape at end of content, Unmatched parentheses |
| 252 | # Handled Edge Cases: Key without values, Missing closing bracket, No values for property |
| 253 | # Handled Edge Cases: Escape at end of content |
| 254 | # Handled Edge Cases: Not starting with '(', Unmatched parentheses |