| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid SGF format |
| 40 | """ |
| 41 | # Edge Case: Empty input |
| 42 | if not input_string: |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Missing opening parenthesis |
| 46 | if not input_string.startswith('('): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Edge Case: Missing closing parenthesis |
| 50 | if not input_string.endswith(')'): |
| 51 | raise ValueError("tree missing") |
| 52 | |
| 53 | # Parse the content inside the outermost parentheses |
| 54 | content = input_string[1:-1] |
| 55 | |
| 56 | # Edge Case: Empty tree |
| 57 | if not content: |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | # Edge Case: Missing semicolon after opening parenthesis |
| 61 | if not content.startswith(';'): |
| 62 | raise ValueError("tree with no nodes") |
| 63 | |
| 64 | return _parse_tree(content) |
| 65 | |
| 66 | |
| 67 | def _parse_tree(content: str) -> SgfTree: |
| 68 | """ |
| 69 | Parse a tree from its content string. |
| 70 | |
| 71 | Args: |
| 72 | content: The content of a tree (without outer parentheses) |
| 73 | |
| 74 | Returns: |
| 75 | SgfTree: The parsed tree |
| 76 | """ |
| 77 | # Edge Case: Content doesn't start with semicolon |
| 78 | if not content.startswith(';'): |
| 79 | raise ValueError("tree with no nodes") |
| 80 | |
| 81 | # Find the first node and any children |
| 82 | node_end = 1 # Start after the semicolon |
| 83 | |
| 84 | # Parse properties of the first node |
| 85 | properties = {} |
| 86 | children = [] |
| 87 | print(f"DEBUG: Parsing content: {repr(content)}") |
| 88 | |
| 89 | # Parse properties |
| 90 | while node_end < len(content) and content[node_end] != '(' and content[node_end] != ';': |
| 91 | # Skip whitespace |
| 92 | while node_end < len(content) and content[node_end].isspace(): |
| 93 | node_end += 1 |
| 94 | |
| 95 | # Check if we've reached the end of properties |
| 96 | if node_end >= len(content) or content[node_end] in '();': |
| 97 | break |
| 98 | |
| 99 | # Parse a property |
| 100 | prop_start = node_end |
| 101 | |
| 102 | # Find the property key (must be uppercase) |
| 103 | while node_end < len(content) and content[node_end].isalpha(): |
| 104 | if not content[node_end].isupper(): |
| 105 | raise ValueError("property must be in uppercase") |
| 106 | node_end += 1 |
| 107 | |
| 108 | # Edge Case: No property key found |
| 109 | if node_end == prop_start: |
| 110 | raise ValueError("properties without delimiter") |
| 111 | |
| 112 | key = content[prop_start:node_end] |
| 113 | |
| 114 | # Edge Case: Missing opening bracket |
| 115 | if node_end >= len(content) or content[node_end] != '[': |
| 116 | raise ValueError("properties without delimiter") |
| 117 | |
| 118 | # Parse all values for this key |
| 119 | values = [] |
| 120 | while node_end < len(content) and content[node_end] == '[': |
| 121 | node_end += 1 # Skip opening bracket |
| 122 | value_start = node_end |
| 123 | |
| 124 | # Parse value, handling escapes |
| 125 | print(f"DEBUG: Starting value parsing from position {value_start}") |
| 126 | while node_end < len(content): |
| 127 | print(f"DEBUG: Checking character at position {node_end}: {repr(content[node_end])}") |
| 128 | if content[node_end] == ']': |
| 129 | # Check if this bracket is escaped |
| 130 | # Count consecutive backslashes immediately before this bracket |
| 131 | backslash_count = 0 |
| 132 | check_pos = node_end - 1 |
| 133 | while check_pos >= value_start and content[check_pos] == '\\': |
| 134 | backslash_count += 1 |
| 135 | check_pos -= 1 |
| 136 | print(f"DEBUG: Found ] at position {node_end}, backslash_count={backslash_count}, value_start={value_start}") |
| 137 | print(f"DEBUG: Characters before: {repr(content[value_start:node_end])}") |
| 138 | # If odd number of backslashes, this bracket is escaped |
| 139 | # But an escaped bracket can still serve as the closing bracket |
| 140 | print(f"DEBUG: Found real closing bracket") |
| 141 | break # This is the real closing bracket, whether escaped or not |
| 142 | else: |
| 143 | node_end += 1 |
| 144 | |
| 145 | # Edge Case: Missing closing bracket |
| 146 | if node_end >= len(content) or content[node_end] != ']': |
| 147 | raise ValueError("properties without delimiter") |
| 148 | |
| 149 | value = _parse_value(content[value_start:node_end]) |
| 150 | values.append(value) |
| 151 | node_end += 1 # Skip closing bracket |
| 152 | |
| 153 | properties[key] = values |
| 154 | print(f"DEBUG: Added property {key} with values {values}") |
| 155 | print(f"DEBUG: node_end now at position {node_end}, remaining: {repr(content[node_end:])}") |
| 156 | |
| 157 | # Parse children |
| 158 | while node_end < len(content) and content[node_end] == '(': |
| 159 | # Find the matching closing parenthesis |
| 160 | paren_count = 1 |
| 161 | child_start = node_end |
| 162 | node_end += 1 |
| 163 | |
| 164 | while node_end < len(content) and paren_count > 0: |
| 165 | if content[node_end] == '(': |
| 166 | paren_count += 1 |
| 167 | elif content[node_end] == ')': |
| 168 | paren_count -= 1 |
| 169 | node_end += 1 |
| 170 | |
| 171 | # Edge Case: Mismatched parentheses |
| 172 | if paren_count != 0: |
| 173 | raise ValueError("tree missing") |
| 174 | |
| 175 | # Parse the child tree |
| 176 | child_content = content[child_start+1:node_end-1] |
| 177 | child_tree = _parse_tree(child_content) |
| 178 | children.append(child_tree) |
| 179 | |
| 180 | # Handle consecutive nodes (shorthand notation) |
| 181 | if node_end < len(content) and content[node_end] == ';': |
| 182 | # Parse the next node as a child |
| 183 | remaining_content = content[node_end:] |
| 184 | next_node = _parse_tree(remaining_content) |
| 185 | # If we already have children, this next node becomes a child of the last child |
| 186 | if children: |
| 187 | children[-1] = SgfTree(children[-1].properties, children[-1].children + [next_node]) |
| 188 | else: |
| 189 | children.append(next_node) |
| 190 | |
| 191 | return SgfTree(properties, children) |
| 192 | |
| 193 | |
| 194 | def _parse_value(value_str: str) -> str: |
| 195 | """ |
| 196 | Parse an SGF text value, handling escapes and whitespace. |
| 197 | |
| 198 | Args: |
| 199 | value_str: The raw value string from between brackets |
| 200 | |
| 201 | Returns: |
| 202 | str: The processed value |
| 203 | """ |
| 204 | result = [] |
| 205 | i = 0 |
| 206 | while i < len(value_str): |
| 207 | if value_str[i] == '\\': |
| 208 | # Handle escape sequences |
| 209 | i += 1 |
| 210 | if i < len(value_str): |
| 211 | char = value_str[i] |
| 212 | # According to SGF spec: |
| 213 | # - Newlines are removed if they come immediately after a '\\' |
| 214 | # - All whitespace characters other than newline are converted to spaces |
| 215 | # - '\\' is the escape character. Any non-whitespace character after '\\' is inserted as-is. |
| 216 | # - Any whitespace character after '\\' follows the above rules. |
| 217 | |
| 218 | if char == '\n': |
| 219 | # Newline immediately after backslash is removed |
| 220 | i += 1 |
| 221 | continue |
| 222 | elif char.isspace(): |
| 223 | # Other whitespace after backslash becomes space |
| 224 | result.append(' ') |
| 225 | i += 1 |
| 226 | else: |
| 227 | # Any non-whitespace character after backslash is inserted as-is |
| 228 | result.append(char) |
| 229 | i += 1 |
| 230 | else: |
| 231 | # Trailing backslash |
| 232 | result.append('\\') |
| 233 | i += 1 |
| 234 | else: |
| 235 | char = value_str[i] |
| 236 | if char == '\n': |
| 237 | # Regular newlines are preserved |
| 238 | result.append(char) |
| 239 | elif char.isspace(): |
| 240 | # All other whitespace characters become spaces |
| 241 | result.append(' ') |
| 242 | else: |
| 243 | # Regular character |
| 244 | result.append(char) |
| 245 | i += 1 |
| 246 | |
| 247 | return ''.join(result) |
| 248 | |
| 249 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty tree, missing semicolon after opening parenthesis, content doesn't start with semicolon, no property key found, missing opening bracket, missing closing bracket, mismatched parentheses, trailing backslash |
| 250 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty tree, missing semicolon after opening parenthesis, content doesn't start with semicolon, no property key found, missing opening bracket, missing closing bracket, mismatched parentheses |
| 251 | # Handled Edge Cases: Backslash handling, whitespace conversion, newline handling |