| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input |
| 42 | if not input_string: |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Input doesn't start with '(' |
| 46 | if not input_string.startswith('('): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Edge Case: Input doesn't end with ')' |
| 50 | if not input_string.endswith(')'): |
| 51 | raise ValueError("tree missing") |
| 52 | |
| 53 | # Remove the outer parentheses |
| 54 | content = input_string[1:-1] |
| 55 | |
| 56 | # Edge Case: Empty tree with no nodes |
| 57 | if not content: |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | # Edge Case: Tree doesn't start with ';' |
| 61 | if not content.startswith(';'): |
| 62 | raise ValueError("tree missing") |
| 63 | |
| 64 | return _parse_tree(content) |
| 65 | |
| 66 | |
| 67 | def _parse_tree(content: str) -> SgfTree: |
| 68 | """ |
| 69 | Parse the content of a tree (without outer parentheses). |
| 70 | |
| 71 | Args: |
| 72 | content: The content of the tree |
| 73 | |
| 74 | Returns: |
| 75 | SgfTree: The parsed tree |
| 76 | """ |
| 77 | # Skip the initial ';' |
| 78 | index = 1 |
| 79 | |
| 80 | # Parse properties of the root node |
| 81 | properties, index = _parse_properties(content, index) |
| 82 | |
| 83 | # Parse children |
| 84 | children = [] |
| 85 | |
| 86 | # Handle sequential nodes indicated by semicolons |
| 87 | if index < len(content) and content[index] == ';': |
| 88 | # Create a chain of sequential nodes |
| 89 | current_children = children |
| 90 | while index < len(content) and content[index] == ';': |
| 91 | child_properties, index = _parse_properties(content, index + 1) |
| 92 | child_tree = SgfTree(properties=child_properties) |
| 93 | current_children.append(child_tree) |
| 94 | # For the next iteration, children of this node will be the current_children |
| 95 | current_children = child_tree.children |
| 96 | |
| 97 | # Edge Case: Handle multiple variations |
| 98 | while index < len(content) and content[index] == '(': |
| 99 | child_content, index = _parse_subtree(content, index) |
| 100 | child_tree = _parse_tree(child_content) |
| 101 | children.append(child_tree) |
| 102 | |
| 103 | return SgfTree(properties=properties, children=children) |
| 104 | |
| 105 | |
| 106 | def _parse_subtree(content: str, start_index: int) -> tuple[str, int]: |
| 107 | """ |
| 108 | Parse a subtree enclosed in parentheses. |
| 109 | |
| 110 | Args: |
| 111 | content: The content string |
| 112 | start_index: The starting index where '(' is located |
| 113 | |
| 114 | Returns: |
| 115 | tuple: (subtree_content, next_index) |
| 116 | """ |
| 117 | # Find matching parenthesis |
| 118 | level = 0 |
| 119 | index = start_index |
| 120 | |
| 121 | while index < len(content): |
| 122 | if content[index] == '(': |
| 123 | level += 1 |
| 124 | elif content[index] == ')': |
| 125 | level -= 1 |
| 126 | if level == 0: |
| 127 | # Found matching parenthesis |
| 128 | return content[start_index+1:index], index + 1 |
| 129 | index += 1 |
| 130 | |
| 131 | # This should not happen if input is valid |
| 132 | raise ValueError("tree missing") |
| 133 | |
| 134 | |
| 135 | def _parse_properties(content: str, start_index: int) -> tuple[dict, int]: |
| 136 | """ |
| 137 | Parse properties from the content starting at start_index. |
| 138 | |
| 139 | Args: |
| 140 | content: The content string |
| 141 | start_index: The starting index |
| 142 | |
| 143 | Returns: |
| 144 | tuple: (properties_dict, next_index) |
| 145 | """ |
| 146 | properties = {} |
| 147 | index = start_index |
| 148 | |
| 149 | # Edge Case: Parse multiple properties in the same node |
| 150 | while index < len(content) and content[index].isalpha(): |
| 151 | # Parse key |
| 152 | key_start = index |
| 153 | while index < len(content) and content[index].isalpha(): |
| 154 | index += 1 |
| 155 | |
| 156 | key = content[key_start:index] |
| 157 | |
| 158 | # Edge Case: Property must be in uppercase |
| 159 | if not key.isupper(): |
| 160 | raise ValueError("property must be in uppercase") |
| 161 | |
| 162 | # Edge Case: Properties without delimiter |
| 163 | if index >= len(content) or content[index] != '[': |
| 164 | raise ValueError("properties without delimiter") |
| 165 | |
| 166 | # Parse values |
| 167 | values = [] |
| 168 | |
| 169 | # Edge Case: Parse multiple values for the same key |
| 170 | while index < len(content) and content[index] == '[': |
| 171 | index += 1 # Skip '[' |
| 172 | value_start = index |
| 173 | |
| 174 | # Parse value, handling escapes |
| 175 | value_chars = [] |
| 176 | while index < len(content): |
| 177 | if content[index] == ']': |
| 178 | # End of value |
| 179 | break |
| 180 | elif content[index] == '\\': |
| 181 | # Handle escape character |
| 182 | index += 1 |
| 183 | if index < len(content): |
| 184 | if content[index] == ']': |
| 185 | # Special case: escaped closing bracket becomes literal backslash |
| 186 | value_chars.append('\\') |
| 187 | else: |
| 188 | # Add the escaped character to the value |
| 189 | value_chars.append(content[index]) |
| 190 | index += 1 |
| 191 | else: |
| 192 | value_chars.append(content[index]) |
| 193 | index += 1 |
| 194 | |
| 195 | value = ''.join(value_chars) |
| 196 | value = _unescape_text(value) |
| 197 | values.append(value) |
| 198 | index += 1 # Skip ']' |
| 199 | |
| 200 | properties[key] = values |
| 201 | |
| 202 | # If we encounter a ';' or '(', stop parsing properties |
| 203 | if index < len(content) and (content[index] == ';' or content[index] == '('): |
| 204 | break |
| 205 | |
| 206 | return properties, index |
| 207 | |
| 208 | |
| 209 | def _unescape_text(text: str) -> str: |
| 210 | """ |
| 211 | Unescape SGF text according to SGF specification. |
| 212 | |
| 213 | Args: |
| 214 | text: The escaped text |
| 215 | |
| 216 | Returns: |
| 217 | str: The unescaped text |
| 218 | """ |
| 219 | result = [] |
| 220 | i = 0 |
| 221 | |
| 222 | while i < len(text): |
| 223 | if text[i] == '\\': |
| 224 | i += 1 # Skip the escape character |
| 225 | |
| 226 | # Edge Case: Handle escape at end of string |
| 227 | if i >= len(text): |
| 228 | # Trailing escape character is just ignored |
| 229 | break |
| 230 | |
| 231 | char = text[i] |
| 232 | |
| 233 | # Edge Case: Newline removal after escape |
| 234 | if char == '\n': |
| 235 | # Newline is removed if immediately after backslash |
| 236 | pass |
| 237 | # Edge Case: Whitespace conversion |
| 238 | elif char.isspace() and char != '\n': |
| 239 | # Other whitespace converted to space |
| 240 | result.append(' ') |
| 241 | else: |
| 242 | # Edge Case: Any non-whitespace character after escape is inserted as-is |
| 243 | result.append(char) |
| 244 | else: |
| 245 | char = text[i] |
| 246 | # Edge Case: Convert non-newline whitespace to spaces |
| 247 | if char.isspace() and char != '\n': |
| 248 | result.append(' ') |
| 249 | else: |
| 250 | result.append(char) |
| 251 | |
| 252 | i += 1 |
| 253 | |
| 254 | return ''.join(result) |
| 255 | # Handled Edge Cases: Empty input, Input doesn't start with '(', Input doesn't end with ')', Empty tree with no nodes, Tree doesn't start with ';', Handle multiple variations, Unclosed bracket, Property must be in uppercase, Properties without delimiter, Parse multiple values for the same key, Parse multiple properties in the same node, Escape at end of string, Newline removal after escape, Whitespace conversion, Any non-whitespace character after escape is inserted as-is, Convert non-newline whitespace to spaces |