| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF (Smart Game Format) string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string containing SGF formatted data |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: A tree structure representing the parsed SGF data |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input or input that doesn't start with '(' |
| 42 | if not input_string or not input_string.startswith('('): |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Input that doesn't end with ')' |
| 46 | if not input_string.endswith(')'): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Remove the outer parentheses |
| 50 | content = input_string[1:-1] |
| 51 | |
| 52 | # Edge Case: Empty content after removing parentheses |
| 53 | if not content: |
| 54 | raise ValueError("tree with no nodes") |
| 55 | |
| 56 | # Edge Case: Content that doesn't start with ';' |
| 57 | if not content.startswith(';'): |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | return _parse_tree(content) |
| 61 | |
| 62 | |
| 63 | def _parse_tree(content: str) -> SgfTree: |
| 64 | """ |
| 65 | Parse a tree from SGF content. |
| 66 | |
| 67 | Args: |
| 68 | content: SGF content string starting with ';' |
| 69 | |
| 70 | Returns: |
| 71 | SgfTree: Parsed tree structure |
| 72 | """ |
| 73 | # Skip the initial ';' |
| 74 | index = 1 |
| 75 | properties = {} |
| 76 | children = [] |
| 77 | |
| 78 | # Parse properties |
| 79 | while index < len(content) and content[index].isalpha(): |
| 80 | # Parse key |
| 81 | key_start = index |
| 82 | while index < len(content) and content[index].isalpha(): |
| 83 | index += 1 |
| 84 | |
| 85 | key = content[key_start:index] |
| 86 | |
| 87 | # Edge Case: Property key is not in uppercase |
| 88 | if not key.isupper(): |
| 89 | raise ValueError("property must be in uppercase") |
| 90 | |
| 91 | # Parse values |
| 92 | values = [] |
| 93 | |
| 94 | # Edge Case: Missing opening bracket for property value |
| 95 | if index >= len(content) or content[index] != '[': |
| 96 | raise ValueError("properties without delimiter") |
| 97 | |
| 98 | while index < len(content) and content[index] == '[': |
| 99 | index += 1 # Skip '[' |
| 100 | value_start = index |
| 101 | |
| 102 | # Parse value, handling escapes |
| 103 | early_break = False |
| 104 | while index < len(content): |
| 105 | # Check if we've reached the end of the value (unescaped ']') |
| 106 | if content[index] == ']': |
| 107 | break |
| 108 | # Check if we've reached the start of a new property |
| 109 | elif index + 1 < len(content) and content[index].isalpha() and content[index + 1] == '[': |
| 110 | # This looks like the start of a new property, so break |
| 111 | # Don't require a closing bracket in this case |
| 112 | early_break = True |
| 113 | break |
| 114 | elif content[index] == '\\': |
| 115 | # Skip escape character and the next character |
| 116 | index += 1 # Move to the escaped character |
| 117 | if index < len(content): |
| 118 | index += 1 # Skip the escaped character |
| 119 | else: |
| 120 | index += 1 |
| 121 | |
| 122 | # Process the value regardless of whether we broke early or not |
| 123 | # The value is from value_start to current index |
| 124 | value = content[value_start:index] |
| 125 | value = _unescape_value(value) |
| 126 | values.append(value) |
| 127 | |
| 128 | # Edge Case: Missing closing bracket (unless we broke early for a new property) |
| 129 | if not early_break: |
| 130 | if index >= len(content) or content[index] != ']': |
| 131 | raise ValueError("properties without delimiter") |
| 132 | index += 1 # Skip ']' |
| 133 | # If we broke early, we don't skip a closing bracket |
| 134 | |
| 135 | properties[key] = values |
| 136 | |
| 137 | # Parse children - both variations (in parentheses) and sequential nodes (separated by semicolons) |
| 138 | while index < len(content): |
| 139 | if content[index] == '(': |
| 140 | # Find matching parenthesis for variation |
| 141 | paren_count = 1 |
| 142 | start = index + 1 |
| 143 | index += 1 |
| 144 | |
| 145 | while index < len(content) and paren_count > 0: |
| 146 | if content[index] == '(': |
| 147 | paren_count += 1 |
| 148 | elif content[index] == ')': |
| 149 | paren_count -= 1 |
| 150 | index += 1 |
| 151 | |
| 152 | # Edge Case: Unmatched parentheses |
| 153 | if paren_count != 0: |
| 154 | raise ValueError("tree missing") |
| 155 | |
| 156 | # Parse child tree |
| 157 | child_content = content[start:index-1] |
| 158 | child_tree = _parse_tree(child_content) |
| 159 | children.append(child_tree) |
| 160 | elif content[index] == ';': |
| 161 | # Parse sequential node |
| 162 | # We need to extract just the part for this node, not the rest |
| 163 | node_end = index + 1 |
| 164 | # Parse properties for this node |
| 165 | while node_end < len(content) and content[node_end].isalpha(): |
| 166 | # Skip property key |
| 167 | while node_end < len(content) and content[node_end].isalpha(): |
| 168 | node_end += 1 |
| 169 | # Skip property values |
| 170 | while node_end < len(content) and content[node_end] == '[': |
| 171 | node_end += 1 # Skip '[' |
| 172 | # Skip until ']' |
| 173 | while node_end < len(content) and content[node_end] != ']': |
| 174 | if content[node_end] == '\\': |
| 175 | node_end += 2 # Skip escape and next character |
| 176 | else: |
| 177 | node_end += 1 |
| 178 | node_end += 1 # Skip ']' |
| 179 | |
| 180 | # Parse child tree for this sequential node |
| 181 | node_content = content[index:node_end] |
| 182 | node_tree = _parse_tree(node_content) |
| 183 | children.append(node_tree) |
| 184 | |
| 185 | # Continue parsing from where this node ended |
| 186 | index = node_end |
| 187 | # If there are more sequential nodes, continue; otherwise break |
| 188 | if index >= len(content) or (content[index] != ';' and content[index] != '('): |
| 189 | break |
| 190 | else: |
| 191 | # No more children |
| 192 | break |
| 193 | |
| 194 | return SgfTree(properties, children) |
| 195 | |
| 196 | |
| 197 | def _unescape_value(value: str) -> str: |
| 198 | """ |
| 199 | Unescape SGF text value according to SGF specification. |
| 200 | |
| 201 | Args: |
| 202 | value: Raw SGF text value |
| 203 | |
| 204 | Returns: |
| 205 | str: Unescaped and processed text value |
| 206 | """ |
| 207 | result = [] |
| 208 | i = 0 |
| 209 | while i < len(value): |
| 210 | if value[i] == '\\': |
| 211 | i += 1 |
| 212 | if i < len(value): |
| 213 | # If next character is newline, remove both \ and newline |
| 214 | if value[i] == '\n': |
| 215 | # Remove the backslash and newline |
| 216 | i += 1 |
| 217 | else: |
| 218 | # Insert the next character as-is |
| 219 | # If it's whitespace, convert to space |
| 220 | if value[i] != '\n' and value[i].isspace(): |
| 221 | result.append(' ') |
| 222 | i += 1 |
| 223 | else: |
| 224 | result.append(value[i]) |
| 225 | i += 1 |
| 226 | else: |
| 227 | # Edge Case: Escape character at end of string |
| 228 | result.append('\\') |
| 229 | else: |
| 230 | # Convert whitespace (except newline) to space |
| 231 | if value[i] != '\n' and value[i].isspace(): |
| 232 | result.append(' ') |
| 233 | else: |
| 234 | result.append(value[i]) |
| 235 | i += 1 |
| 236 | |
| 237 | return ''.join(result) |
| 238 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty content after parentheses, missing semicolon, property key not uppercase, missing opening bracket, missing closing bracket, unmatched parentheses, escape character at end of string |
| 239 | # Handled Edge Cases: Property key not uppercase, missing opening bracket for property value, missing closing bracket, unmatched parentheses |
| 240 | # Handled Edge Cases: Escape character at end of string, newline removal after backslash, whitespace conversion to space |