| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | def parse(input_string: str) -> SgfTree: |
| 29 | | pass |
| 29 | """ |
| 30 | Parse an SGF (Smart Game Format) string and return a tree structure of properties. |
| 31 | |
| 32 | Args: |
| 33 | input_string: A string containing SGF formatted data |
| 34 | |
| 35 | Returns: |
| 36 | SgfTree: A tree structure representing the SGF data |
| 37 | |
| 38 | Raises: |
| 39 | ValueError: If the input is malformed or doesn't conform to SGF format |
| 40 | """ |
| 41 | # Edge Case: Empty input or doesn't start with '(' |
| 42 | if not input_string or not input_string.startswith('('): |
| 43 | raise ValueError("tree missing") |
| 44 | |
| 45 | # Edge Case: Doesn't end with ')' |
| 46 | if not input_string.endswith(')'): |
| 47 | raise ValueError("tree missing") |
| 48 | |
| 49 | # Remove the outer parentheses |
| 50 | content = input_string[1:-1] |
| 51 | |
| 52 | # Edge Case: Empty content after removing parentheses |
| 53 | if not content: |
| 54 | raise ValueError("tree with no nodes") |
| 55 | |
| 56 | return _parse_tree(content) |
| 57 | |
| 58 | |
| 59 | def _parse_tree(content: str) -> SgfTree: |
| 60 | """ |
| 61 | Parse the content of a tree (without outer parentheses). |
| 62 | """ |
| 63 | # Find the first node |
| 64 | if not content.startswith(';'): |
| 65 | raise ValueError("tree with no nodes") |
| 66 | |
| 67 | # Find where the first node ends and children begin |
| 68 | node_end = 1 # Start after the ';' |
| 69 | |
| 70 | # Parse properties of the first node |
| 71 | properties = {} |
| 72 | children = [] |
| 73 | |
| 74 | # Parse properties |
| 75 | while node_end < len(content) and content[node_end] != '(' and content[node_end] != ';': |
| 76 | # Parse key |
| 77 | key_start = node_end |
| 78 | while node_end < len(content) and content[node_end].isalpha(): |
| 79 | node_end += 1 |
| 80 | |
| 81 | key = content[key_start:node_end] |
| 82 | |
| 83 | # Edge Case: Property key is not uppercase |
| 84 | if not key.isupper(): |
| 85 | raise ValueError("property must be in uppercase") |
| 86 | |
| 87 | # Edge Case: Key without values |
| 88 | if node_end >= len(content) or content[node_end] != '[': |
| 89 | raise ValueError("properties without delimiter") |
| 90 | |
| 91 | # Parse values |
| 92 | values = [] |
| 93 | while node_end < len(content) and content[node_end] == '[': |
| 94 | node_end += 1 # Skip '[' |
| 95 | value_start = node_end |
| 96 | |
| 97 | # Find the matching ']' |
| 98 | bracket_count = 1 |
| 99 | while node_end < len(content) and bracket_count > 0: |
| 100 | if content[node_end] == '\\': |
| 101 | node_end += 2 # Skip escape character and the next character |
| 102 | elif content[node_end] == '[': |
| 103 | bracket_count += 1 |
| 104 | node_end += 1 |
| 105 | elif content[node_end] == ']': |
| 106 | bracket_count -= 1 |
| 107 | node_end += 1 |
| 108 | else: |
| 109 | node_end += 1 |
| 110 | |
| 111 | # Edge Case: Unmatched bracket |
| 112 | if bracket_count > 0: |
| 113 | raise ValueError("properties without delimiter") |
| 114 | |
| 115 | value = content[value_start:node_end-1] |
| 116 | values.append(_unescape_text(value)) |
| 117 | |
| 118 | properties[key] = values |
| 119 | |
| 120 | # Parse children |
| 121 | while node_end < len(content) and content[node_end] == '(': |
| 122 | # Find the matching ')' |
| 123 | paren_count = 1 |
| 124 | child_start = node_end + 1 |
| 125 | node_end += 1 |
| 126 | |
| 127 | while node_end < len(content) and paren_count > 0: |
| 128 | if content[node_end] == '\\': |
| 129 | node_end += 2 # Skip escape character and the next character |
| 130 | elif content[node_end] == '(': |
| 131 | paren_count += 1 |
| 132 | node_end += 1 |
| 133 | elif content[node_end] == ')': |
| 134 | paren_count -= 1 |
| 135 | node_end += 1 |
| 136 | else: |
| 137 | node_end += 1 |
| 138 | |
| 139 | # Edge Case: Unmatched parenthesis |
| 140 | if paren_count > 0: |
| 141 | raise ValueError("tree missing") |
| 142 | |
| 143 | child_content = content[child_start:node_end-1] |
| 144 | children.append(_parse_tree(child_content)) |
| 145 | |
| 146 | # Handle consecutive nodes (shorthand notation) |
| 147 | if node_end < len(content) and content[node_end] == ';': |
| 148 | # Create a child node with the rest of the content |
| 149 | child_content = content[node_end:] |
| 150 | children.append(_parse_tree(child_content)) |
| 151 | |
| 152 | return SgfTree(properties, children) |
| 153 | |
| 154 | |
| 155 | def _unescape_text(text: str) -> str: |
| 156 | """ |
| 157 | Process SGF text according to SGF specification: |
| 158 | - Newlines are removed if they come immediately after a '\' |
| 159 | - All whitespace characters other than newline are converted to spaces |
| 160 | - '\' is the escape character |
| 161 | """ |
| 162 | result = [] |
| 163 | i = 0 |
| 164 | while i < len(text): |
| 165 | if text[i] == '\\': |
| 166 | # Edge Case: Handle escape sequences |
| 167 | i += 1 |
| 168 | if i < len(text): |
| 169 | # If next character is a newline, remove it |
| 170 | if text[i] == '\n': |
| 171 | i += 1 # Just skip the newline |
| 172 | # If next character is whitespace, convert to space |
| 173 | elif text[i].isspace() and text[i] != '\n': |
| 174 | result.append(' ') |
| 175 | i += 1 |
| 176 | # Otherwise, insert as-is |
| 177 | else: |
| 178 | result.append(text[i]) |
| 179 | i += 1 |
| 180 | # Edge Case: Escape at end of string |
| 181 | else: |
| 182 | result.append('\\') |
| 183 | else: |
| 184 | # Convert whitespace (other than newline) to spaces |
| 185 | if text[i].isspace() and text[i] != '\n': |
| 186 | result.append(' ') |
| 187 | else: |
| 188 | result.append(text[i]) |
| 189 | i += 1 |
| 190 | |
| 191 | return ''.join(result) |
| 192 | # Handled Edge Cases: Empty input, missing start parenthesis, missing end parenthesis, empty content after removing parentheses, missing semicolon, property key not uppercase, key without values, unmatched bracket, unmatched parenthesis, escape at end of string |
| 193 | # Handled Edge Cases: Property key not uppercase, key without values, unmatched bracket |
| 194 | # Handled Edge Cases: Unmatched parenthesis, escape at end of string, consecutive nodes shorthand notation |