| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: A tree structure representing the SGF data |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input or input that doesn't start with '(' |
| 42 | if not input_string or not input_string.startswith('('): |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Input that doesn't end with ')' |
| 46 | if not input_string.endswith(')'): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Remove the outer parentheses |
| 50 | content = input_string[1:-1] |
| 51 | |
| 52 | # Edge Case: Empty content after removing parentheses |
| 53 | if not content: |
| 54 | raise ValueError("tree with no nodes") |
| 55 | |
| 56 | # Edge Case: Content that doesn't start with ';' |
| 57 | if not content.startswith(';'): |
| 58 | raise ValueError("tree missing") |
| 59 | |
| 60 | return _parse_tree(content) |
| 61 | |
| 62 | |
| 63 | def _parse_tree(content: str) -> SgfTree: |
| 64 | """ |
| 65 | Parse a tree from SGF content. |
| 66 | |
| 67 | Args: |
| 68 | content: SGF content starting with ';' |
| 69 | |
| 70 | Returns: |
| 71 | SgfTree: Parsed tree |
| 72 | """ |
| 73 | # Skip the initial ';' |
| 74 | content = content[1:] |
| 75 | |
| 76 | # Parse properties of the current node |
| 77 | properties, remaining = _parse_properties(content) |
| 78 | |
| 79 | children = [] |
| 80 | |
| 81 | # Parse children if any |
| 82 | while remaining: |
| 83 | if remaining.startswith('('): |
| 84 | # Parse variation (child in parentheses) |
| 85 | # Find the matching closing parenthesis |
| 86 | balance = 1 |
| 87 | i = 1 # Start after the opening '(' |
| 88 | while i < len(remaining) and balance > 0: |
| 89 | if remaining[i] == '(': |
| 90 | balance += 1 |
| 91 | elif remaining[i] == ')': |
| 92 | balance -= 1 |
| 93 | i += 1 |
| 94 | |
| 95 | # Edge Case: Unmatched parentheses |
| 96 | if balance != 0: |
| 97 | raise ValueError("tree missing") |
| 98 | |
| 99 | # Parse the child tree |
| 100 | child_content = remaining[1:i-1] |
| 101 | child_tree = _parse_tree(child_content) |
| 102 | children.append(child_tree) |
| 103 | |
| 104 | # Move to the next potential child |
| 105 | remaining = remaining[i:] |
| 106 | elif remaining.startswith(';'): |
| 107 | # Parse sequential node (child after semicolon) |
| 108 | child_tree = _parse_tree(remaining) |
| 109 | children.append(child_tree) |
| 110 | remaining = '' # All remaining content consumed by child |
| 111 | else: |
| 112 | # Invalid content - not a variation or sequential node |
| 113 | break |
| 114 | |
| 115 | return SgfTree(properties, children) |
| 116 | |
| 117 | |
| 118 | def _parse_properties(content: str) -> tuple[dict, str]: |
| 119 | """ |
| 120 | Parse properties from SGF content. |
| 121 | |
| 122 | Args: |
| 123 | content: SGF content containing properties |
| 124 | |
| 125 | Returns: |
| 126 | tuple: (properties dict, remaining content) |
| 127 | """ |
| 128 | properties = {} |
| 129 | remaining = content |
| 130 | |
| 131 | while remaining and remaining[0].isalpha(): |
| 132 | # Parse property key |
| 133 | key_end = 0 |
| 134 | while key_end < len(remaining) and remaining[key_end].isalpha(): |
| 135 | key_end += 1 |
| 136 | |
| 137 | key = remaining[:key_end] |
| 138 | |
| 139 | # Edge Case: Property key is not uppercase |
| 140 | if key != key.upper(): |
| 141 | raise ValueError("property must be in uppercase") |
| 142 | |
| 143 | remaining = remaining[key_end:] |
| 144 | |
| 145 | # Edge Case: Missing opening bracket for property value |
| 146 | if not remaining.startswith('['): |
| 147 | raise ValueError("properties without delimiter") |
| 148 | |
| 149 | values = [] |
| 150 | |
| 151 | # Parse all values for this key |
| 152 | while remaining.startswith('['): |
| 153 | remaining = remaining[1:] # Skip '[' |
| 154 | |
| 155 | # Parse value |
| 156 | value, remaining = _parse_value(remaining) |
| 157 | |
| 158 | # Edge Case: Missing closing bracket |
| 159 | if not remaining.startswith(']'): |
| 160 | raise ValueError("properties without delimiter") |
| 161 | |
| 162 | remaining = remaining[1:] # Skip ']' |
| 163 | values.append(value) |
| 164 | |
| 165 | properties[key] = values |
| 166 | |
| 167 | return properties, remaining |
| 168 | |
| 169 | |
| 170 | def _parse_value(content: str) -> tuple[str, str]: |
| 171 | """ |
| 172 | Parse a value from SGF content, handling escape sequences. |
| 173 | |
| 174 | Args: |
| 175 | content: SGF content starting with the value |
| 176 | |
| 177 | Returns: |
| 178 | tuple: (parsed value, remaining content) |
| 179 | """ |
| 180 | value = [] |
| 181 | i = 0 |
| 182 | |
| 183 | while i < len(content): |
| 184 | char = content[i] |
| 185 | |
| 186 | if char == ']': |
| 187 | # End of value |
| 188 | break |
| 189 | elif char == '\\': |
| 190 | # Handle escape sequence |
| 191 | i += 1 |
| 192 | if i >= len(content): |
| 193 | # Edge Case: Escape character at end of content |
| 194 | value.append('\\') |
| 195 | break |
| 196 | |
| 197 | next_char = content[i] |
| 198 | if next_char == '\n': |
| 199 | # Newline is removed if immediately after \, but we add a space |
| 200 | value.append(' ') |
| 201 | elif next_char in ' \t\r': |
| 202 | # Whitespace after escape converted to space |
| 203 | value.append(' ') |
| 204 | else: |
| 205 | # Any non-whitespace character after \ is inserted as-is |
| 206 | value.append(next_char) |
| 207 | elif char in ' \t\r': |
| 208 | # All whitespace characters other than newline are converted to spaces |
| 209 | value.append(' ') |
| 210 | elif char == '\n': |
| 211 | # Newlines remain as newlines unless immediately after \ |
| 212 | value.append(char) |
| 213 | else: |
| 214 | # Regular character |
| 215 | value.append(char) |
| 216 | |
| 217 | i += 1 |
| 218 | |
| 219 | return ''.join(value), content[i:] |
| 220 | |
| 221 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty content after parentheses, missing semicolon, unmatched parentheses, property key not uppercase, missing opening bracket, missing closing bracket, escape character at end of content\n |