| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string in SGF format |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: The parsed SGF tree |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is invalid according to SGF format rules |
| 40 | """ |
| 41 | # Edge Case: Empty input or doesn't start with '(' |
| 42 | if not input_string or not input_string.startswith('('): |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Doesn't end with ')' |
| 46 | if not input_string.endswith(')'): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Remove the outer parentheses |
| 50 | content = input_string[1:-1] |
| 51 | |
| 52 | # Edge Case: Empty content after removing parentheses |
| 53 | if not content: |
| 54 | raise ValueError("tree with no nodes") |
| 55 | |
| 56 | # Edge Case: Doesn't start with ';' |
| 57 | if not content.startswith(';'): |
| 58 | raise ValueError("tree with no nodes") |
| 59 | |
| 60 | return _parse_tree(content) |
| 61 | |
| 62 | |
| 63 | def _parse_tree(content: str) -> SgfTree: |
| 64 | """ |
| 65 | Parse a tree from content string. |
| 66 | |
| 67 | Args: |
| 68 | content: Content string starting with ';' |
| 69 | |
| 70 | Returns: |
| 71 | SgfTree: The parsed tree |
| 72 | """ |
| 73 | # Skip the initial ';' |
| 74 | index = 1 |
| 75 | |
| 76 | # Parse properties |
| 77 | properties = {} |
| 78 | while index < len(content) and content[index] != '(' and content[index] != ')': |
| 79 | # If we encounter a semicolon, it means we're starting a new node |
| 80 | # This current node should become the parent of the new node(s) |
| 81 | if content[index] == ';': |
| 82 | break |
| 83 | |
| 84 | # Parse key |
| 85 | key_start = index |
| 86 | while index < len(content) and content[index].isalpha(): |
| 87 | index += 1 |
| 88 | |
| 89 | # Edge Case: No key found |
| 90 | if key_start == index: |
| 91 | break |
| 92 | |
| 93 | # Edge Case: Key is not all uppercase |
| 94 | key = content[key_start:index] |
| 95 | if not key.isupper(): |
| 96 | raise ValueError("property must be in uppercase") |
| 97 | |
| 98 | # Edge Case: Key exists but has no values |
| 99 | if index >= len(content) or content[index] != '[': |
| 100 | raise ValueError("properties without delimiter") |
| 101 | |
| 102 | # Parse values |
| 103 | values = [] |
| 104 | while index < len(content) and content[index] == '[': |
| 105 | index += 1 # Skip '[' |
| 106 | value_start = index |
| 107 | |
| 108 | # Parse value, handling escapes |
| 109 | while index < len(content) and content[index] != ']': |
| 110 | if content[index] == '\\': |
| 111 | index += 2 # Skip escape and next character |
| 112 | else: |
| 113 | index += 1 |
| 114 | |
| 115 | # Edge Case: Unclosed value bracket |
| 116 | if index >= len(content): |
| 117 | raise ValueError("properties without delimiter") |
| 118 | |
| 119 | # Extract and process value |
| 120 | value = _process_text(content[value_start:index]) |
| 121 | values.append(value) |
| 122 | index += 1 # Skip ']' |
| 123 | |
| 124 | # Edge Case: No values found for key |
| 125 | if not values: |
| 126 | raise ValueError("properties without delimiter") |
| 127 | |
| 128 | properties[key] = values |
| 129 | |
| 130 | # Parse children |
| 131 | children = [] |
| 132 | |
| 133 | # First, check if we have a semicolon indicating a direct child node |
| 134 | if index < len(content) and content[index] == ';': |
| 135 | # Parse the rest of the content as a single child node |
| 136 | child_content = content[index:] |
| 137 | children.append(_parse_tree(child_content)) |
| 138 | else: |
| 139 | # Parse variations in parentheses |
| 140 | while index < len(content) and content[index] == '(': |
| 141 | # Find matching parenthesis for this child |
| 142 | paren_count = 1 |
| 143 | start = index |
| 144 | index += 1 |
| 145 | |
| 146 | while index < len(content) and paren_count > 0: |
| 147 | if content[index] == '(': |
| 148 | paren_count += 1 |
| 149 | elif content[index] == ')': |
| 150 | paren_count -= 1 |
| 151 | index += 1 |
| 152 | |
| 153 | # Edge Case: Mismatched parentheses |
| 154 | if paren_count != 0: |
| 155 | raise ValueError("tree missing") |
| 156 | |
| 157 | # Parse the child tree |
| 158 | child_content = content[start+1:index-1] |
| 159 | children.append(_parse_tree(child_content)) |
| 160 | |
| 161 | # Edge Case: No properties and no children |
| 162 | if not properties and not children: |
| 163 | raise ValueError("tree with no nodes") |
| 164 | |
| 165 | return SgfTree(properties, children) |
| 166 | |
| 167 | |
| 168 | def _process_text(text: str) -> str: |
| 169 | """ |
| 170 | Process SGF text according to SGF Text type rules. |
| 171 | |
| 172 | Args: |
| 173 | text: Raw text from SGF property value |
| 174 | |
| 175 | Returns: |
| 176 | str: Processed text |
| 177 | """ |
| 178 | result = [] |
| 179 | i = 0 |
| 180 | while i < len(text): |
| 181 | if text[i] == '\\': |
| 182 | # Handle escape character |
| 183 | i += 1 |
| 184 | if i < len(text): |
| 185 | # Check if next character is a newline |
| 186 | if text[i] == '\n': |
| 187 | # Newline immediately after \ is removed |
| 188 | i += 1 |
| 189 | continue |
| 190 | else: |
| 191 | # Any character after \ is inserted as-is |
| 192 | result.append(text[i]) |
| 193 | i += 1 |
| 194 | else: |
| 195 | # Trailing backslash is just added |
| 196 | result.append('\\') |
| 197 | elif text[i] == '\n': |
| 198 | # Newline not preceded by \ is kept |
| 199 | result.append('\n') |
| 200 | i += 1 |
| 201 | elif text[i] in (' ', '\t', '\r'): |
| 202 | # All whitespace other than newline becomes space |
| 203 | result.append(' ') |
| 204 | i += 1 |
| 205 | else: |
| 206 | # All other characters added as-is |
| 207 | result.append(text[i]) |
| 208 | i += 1 |
| 209 | |
| 210 | return ''.join(result) |
| 211 | # Handled Edge Cases: Empty input, missing opening parenthesis, missing closing parenthesis, empty content after parentheses removal, missing semicolon, key not uppercase, missing delimiter for property values, unclosed value bracket, no values for key, mismatched parentheses |
| 212 | # Handled Edge Cases: Escape character handling, newline handling, whitespace conversion |