| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string containing SGF formatted data |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: A tree structure representing the SGF data |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input or input that doesn't start with '(' |
| 42 | if not input_string or not input_string.startswith('('): |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Input that doesn't end with ')' |
| 46 | if not input_string.endswith(')'): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Remove the outer parentheses |
| 50 | content = input_string[1:-1] |
| 51 | |
| 52 | # Edge Case: Empty content after removing parentheses |
| 53 | if not content: |
| 54 | raise ValueError("tree with no nodes") |
| 55 | |
| 56 | # Edge Case: Content that doesn't start with ';' |
| 57 | if not content.startswith(';'): |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | return _parse_tree(content) |
| 61 | |
| 62 | |
| 63 | def _parse_tree(content: str) -> SgfTree: |
| 64 | """ |
| 65 | Parse a tree from SGF content. |
| 66 | |
| 67 | Args: |
| 68 | content: SGF content string starting with ';' |
| 69 | |
| 70 | Returns: |
| 71 | SgfTree: Parsed tree structure |
| 72 | """ |
| 73 | # Skip the initial ';' |
| 74 | index = 1 |
| 75 | properties = {} |
| 76 | children = [] |
| 77 | |
| 78 | # Parse properties |
| 79 | while index < len(content) and content[index] not in '();': |
| 80 | # Parse property key |
| 81 | key_start = index |
| 82 | while index < len(content) and content[index].isalpha() and content[index].isupper(): |
| 83 | index += 1 |
| 84 | |
| 85 | # Edge Case: Property key is not in uppercase |
| 86 | if index == key_start: |
| 87 | raise ValueError("property must be in uppercase") |
| 88 | |
| 89 | key = content[key_start:index] |
| 90 | |
| 91 | # Edge Case: No delimiter after key |
| 92 | if index >= len(content) or content[index] != '[': |
| 93 | raise ValueError("properties without delimiter") |
| 94 | |
| 95 | # Parse property values |
| 96 | values = [] |
| 97 | while index < len(content) and content[index] == '[': |
| 98 | index += 1 # Skip '[' |
| 99 | value_start = index |
| 100 | |
| 101 | # Parse value, handling escapes |
| 102 | while index < len(content) and content[index] != ']': |
| 103 | if content[index] == '\\': |
| 104 | index += 2 # Skip escape and next character |
| 105 | else: |
| 106 | index += 1 |
| 107 | |
| 108 | # Edge Case: Unclosed value bracket |
| 109 | if index >= len(content): |
| 110 | raise ValueError("properties without delimiter") |
| 111 | |
| 112 | value = _unescape_text(content[value_start:index]) |
| 113 | values.append(value) |
| 114 | index += 1 # Skip ']' |
| 115 | |
| 116 | properties[key] = values |
| 117 | |
| 118 | # Parse children |
| 119 | while index < len(content) and content[index] in '();': |
| 120 | if content[index] == '(': # Start of a variation |
| 121 | # Find matching parenthesis |
| 122 | paren_count = 1 |
| 123 | child_start = index |
| 124 | index += 1 |
| 125 | |
| 126 | while index < len(content) and paren_count > 0: |
| 127 | if content[index] == '(': |
| 128 | paren_count += 1 |
| 129 | elif content[index] == ')': |
| 130 | paren_count -= 1 |
| 131 | index += 1 |
| 132 | |
| 133 | # Edge Case: Unmatched parenthesis |
| 134 | if paren_count != 0: |
| 135 | raise ValueError("tree missing") |
| 136 | |
| 137 | child_content = content[child_start:index] |
| 138 | children.append(_parse_tree(child_content[1:-1])) |
| 139 | elif content[index] == ';': # Direct child node |
| 140 | # Parse just this single node |
| 141 | child_start = index |
| 142 | index += 1 |
| 143 | |
| 144 | # Parse properties of this child node |
| 145 | while index < len(content) and content[index] not in '();': |
| 146 | # Parse key |
| 147 | key_start = index |
| 148 | while index < len(content) and content[index].isalpha() and content[index].isupper(): |
| 149 | index += 1 |
| 150 | |
| 151 | # Edge Case: Property key is not in uppercase |
| 152 | if index == key_start: |
| 153 | raise ValueError("property must be in uppercase") |
| 154 | |
| 155 | # Edge Case: No delimiter after key |
| 156 | if index >= len(content) or content[index] != '[': |
| 157 | raise ValueError("properties without delimiter") |
| 158 | |
| 159 | # Parse values |
| 160 | while index < len(content) and content[index] == '[': |
| 161 | index += 1 # Skip '[' |
| 162 | # Skip until ']' |
| 163 | while index < len(content) and content[index] != ']': |
| 164 | if content[index] == '\\': |
| 165 | index += 2 |
| 166 | else: |
| 167 | index += 1 |
| 168 | index += 1 # Skip ']' |
| 169 | |
| 170 | child_content = content[child_start:index] |
| 171 | child_node = _parse_tree(child_content) |
| 172 | |
| 173 | # If there are more nodes in sequence, handle them properly |
| 174 | if index < len(content) and content[index] == ';': |
| 175 | # For sequential nodes, we need to build a chain |
| 176 | # Parse the remaining nodes one by one |
| 177 | current_parent = child_node |
| 178 | while index < len(content) and content[index] == ';': |
| 179 | # Parse the next node |
| 180 | node_start = index |
| 181 | index += 1 |
| 182 | |
| 183 | # Parse properties of this node |
| 184 | while index < len(content) and content[index] not in '();': |
| 185 | # Parse key |
| 186 | key_start = index |
| 187 | while index < len(content) and content[index].isalpha() and content[index].isupper(): |
| 188 | index += 1 |
| 189 | |
| 190 | # Edge Case: Property key is not in uppercase |
| 191 | if index == key_start: |
| 192 | raise ValueError("property must be in uppercase") |
| 193 | |
| 194 | # Edge Case: No delimiter after key |
| 195 | if index >= len(content) or content[index] != '[': |
| 196 | raise ValueError("properties without delimiter") |
| 197 | |
| 198 | # Parse values |
| 199 | while index < len(content) and content[index] == '[': |
| 200 | index += 1 # Skip '[' |
| 201 | # Skip until ']' |
| 202 | while index < len(content) and content[index] != ']': |
| 203 | if content[index] == '\\': |
| 204 | index += 2 |
| 205 | else: |
| 206 | index += 1 |
| 207 | index += 1 # Skip ']' |
| 208 | |
| 209 | node_content = content[node_start:index] |
| 210 | node = _parse_tree(node_content) |
| 211 | current_parent.children = [node] |
| 212 | current_parent = node |
| 213 | |
| 214 | children.append(child_node) |
| 215 | else: # content[index] == ')' |
| 216 | break |
| 217 | |
| 218 | return SgfTree(properties, children) |
| 219 | |
| 220 | |
| 221 | def _unescape_text(text: str) -> str: |
| 222 | """ |
| 223 | Process SGF text according to SGF text type rules. |
| 224 | |
| 225 | Args: |
| 226 | text: Raw text from SGF property value |
| 227 | |
| 228 | Returns: |
| 229 | str: Processed text with escape sequences handled |
| 230 | """ |
| 231 | result = [] |
| 232 | i = 0 |
| 233 | while i < len(text): |
| 234 | if text[i] == '\\': |
| 235 | i += 1 |
| 236 | if i < len(text): |
| 237 | # If next character is newline, remove both \ and newline |
| 238 | if text[i] == '\n': |
| 239 | i += 1 |
| 240 | continue |
| 241 | # Any character after \ is inserted as-is |
| 242 | # Special case: if the character is ']', we need to double it in the output |
| 243 | if text[i] == ']': |
| 244 | result.append(']]') |
| 245 | else: |
| 246 | result.append(text[i]) |
| 247 | i += 1 |
| 248 | else: |
| 249 | # Trailing backslash - should not happen in valid SGF but we handle it |
| 250 | result.append('\\') |
| 251 | i += 1 |
| 252 | else: |
| 253 | # Convert all whitespace (except newline) to spaces |
| 254 | if text[i] != '\n' and text[i].isspace(): |
| 255 | result.append(' ') |
| 256 | else: |
| 257 | result.append(text[i]) |
| 258 | i += 1 |
| 259 | |
| 260 | return ''.join(result) |
| 261 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty content after parentheses removal, missing semicolon, property key not in uppercase, missing delimiter after key, unclosed value bracket, unmatched parenthesis, trailing backslash |