| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF (Smart Game Format) string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string containing SGF formatted data |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: A tree structure representing the SGF data |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input string is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input or doesn't start with '(' |
| 42 | if not input_string or not input_string.startswith('('): |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Doesn't end with ')' |
| 46 | if not input_string.endswith(')'): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Remove the outer parentheses |
| 50 | content = input_string[1:-1] |
| 51 | |
| 52 | # Edge Case: Empty content after removing parentheses |
| 53 | if not content: |
| 54 | raise ValueError("tree with no nodes") |
| 55 | |
| 56 | return _parse_tree(content) |
| 57 | |
| 58 | |
| 59 | def _parse_tree(content: str) -> SgfTree: |
| 60 | """ |
| 61 | Parse the content of a tree (without outer parentheses). |
| 62 | |
| 63 | Args: |
| 64 | content: The content inside the outer parentheses |
| 65 | |
| 66 | Returns: |
| 67 | SgfTree: A tree structure representing the content |
| 68 | """ |
| 69 | # Find the first node |
| 70 | if not content.startswith(';'): |
| 71 | raise ValueError("tree with no nodes") |
| 72 | |
| 73 | # Parse the first node |
| 74 | node_end = 1 # Start after the ';' |
| 75 | properties = {} |
| 76 | |
| 77 | # Parse properties of the first node |
| 78 | while node_end < len(content) and content[node_end] != '(' and content[node_end] != ';': |
| 79 | # Parse key |
| 80 | key_start = node_end |
| 81 | while node_end < len(content) and content[node_end].isalpha(): |
| 82 | node_end += 1 |
| 83 | |
| 84 | # Edge Case: Key is not all uppercase |
| 85 | key = content[key_start:node_end] |
| 86 | if not key.isupper(): |
| 87 | raise ValueError("property must be in uppercase") |
| 88 | |
| 89 | # Edge Case: No key found |
| 90 | if not key: |
| 91 | raise ValueError("properties without delimiter") |
| 92 | |
| 93 | # Parse values |
| 94 | values = [] |
| 95 | while node_end < len(content) and content[node_end] == '[': |
| 96 | node_end += 1 # Skip '[' |
| 97 | value_start = node_end |
| 98 | |
| 99 | # Parse value, handling escapes |
| 100 | while node_end < len(content) and content[node_end] != ']': |
| 101 | if content[node_end] == '\\': |
| 102 | node_end += 2 # Skip escape and next character |
| 103 | else: |
| 104 | node_end += 1 |
| 105 | |
| 106 | # Edge Case: Unclosed bracket |
| 107 | if node_end >= len(content) or content[node_end] != ']': |
| 108 | raise ValueError("properties without delimiter") |
| 109 | |
| 110 | value = content[value_start:node_end] |
| 111 | # Process the value according to SGF Text type rules |
| 112 | value = _process_text(value) |
| 113 | values.append(value) |
| 114 | node_end += 1 # Skip ']' |
| 115 | |
| 116 | # Edge Case: No values for a key |
| 117 | if not values: |
| 118 | raise ValueError("properties without delimiter") |
| 119 | |
| 120 | properties[key] = values |
| 121 | |
| 122 | children = [] |
| 123 | |
| 124 | # Parse children |
| 125 | while node_end < len(content) and content[node_end] == '(': |
| 126 | # Find matching closing parenthesis |
| 127 | paren_count = 1 |
| 128 | child_start = node_end + 1 |
| 129 | node_end += 1 |
| 130 | |
| 131 | while node_end < len(content) and paren_count > 0: |
| 132 | if content[node_end] == '(': |
| 133 | paren_count += 1 |
| 134 | elif content[node_end] == ')': |
| 135 | paren_count -= 1 |
| 136 | node_end += 1 |
| 137 | |
| 138 | # Edge Case: Unmatched parenthesis |
| 139 | if paren_count != 0: |
| 140 | raise ValueError("tree missing") |
| 141 | |
| 142 | child_content = content[child_start:node_end-1] |
| 143 | children.append(_parse_tree(child_content)) |
| 144 | |
| 145 | # Check for additional nodes at the same level |
| 146 | if node_end < len(content) and content[node_end] == ';': |
| 147 | # This is a shorthand - the next node is a child of the current node |
| 148 | remaining_content = content[node_end:] |
| 149 | children.append(_parse_tree(remaining_content)) |
| 150 | |
| 151 | return SgfTree(properties, children) |
| 152 | |
| 153 | |
| 154 | def _process_text(text: str) -> str: |
| 155 | """ |
| 156 | Process text according to SGF Text type rules. |
| 157 | |
| 158 | Args: |
| 159 | text: Raw text value from SGF |
| 160 | |
| 161 | Returns: |
| 162 | str: Processed text |
| 163 | """ |
| 164 | result = [] |
| 165 | i = 0 |
| 166 | while i < len(text): |
| 167 | if text[i] == '\\': |
| 168 | # Handle escape character |
| 169 | i += 1 |
| 170 | if i < len(text): |
| 171 | char = text[i] |
| 172 | if char == 'n' and (i+1 >= len(text) or text[i+1] != ']'): |
| 173 | # Special case: \n not followed by ] is treated as newline |
| 174 | # But newlines are removed if they come immediately after \ |
| 175 | # So we check if there's a newline |
| 176 | if i+1 < len(text) and text[i+1] == '\n': |
| 177 | i += 1 # Skip the actual newline |
| 178 | # Otherwise just add 'n' |
| 179 | result.append(char) |
| 180 | elif char == '\n': |
| 181 | # Newline immediately after \ is removed |
| 182 | pass |
| 183 | elif char.isspace(): |
| 184 | # Whitespace after \ follows normal rules |
| 185 | if char != '\n': |
| 186 | result.append(' ') |
| 187 | else: |
| 188 | # Any non-whitespace character after \ is inserted as-is |
| 189 | result.append(char) |
| 190 | # If escape is at end of string, just ignore it |
| 191 | else: |
| 192 | char = text[i] |
| 193 | if char == '\n': |
| 194 | # Newlines remain as newlines unless immediately after \ |
| 195 | result.append(char) |
| 196 | elif char.isspace() and char != '\n': |
| 197 | # All whitespace other than newline converted to spaces |
| 198 | result.append(' ') |
| 199 | else: |
| 200 | result.append(char) |
| 201 | i += 1 |
| 202 | |
| 203 | return ''.join(result) |
| 204 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty content after removing parentheses, missing semicolon for node, key not uppercase, no key found, unclosed bracket, no values for key, unmatched parenthesis |
| 205 | # Handled Edge Cases: Key not uppercase, no key found, unclosed bracket, no values for key |
| 206 | # Handled Edge Cases: Empty input or doesn't start with '(', doesn't end with ')', empty content after removing parentheses, missing semicolon for node, unmatched parenthesis, additional nodes at same level |