Skip to content

Commit 592b598

Browse files
committed
add tokenization
1 parent 96dd4ab commit 592b598

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

chebai/preprocessing/reader.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -387,15 +387,16 @@ def _read_data(self, raw_data: str) -> Optional[List[int]]:
387  387            try:
388  388                mol = Chem.MolFromSmiles(raw_data, sanitize=False)
389  389                inchi = Chem.MolToInchi(mol)
     390  +             tokenized = [self._get_token_index(v) for v in inchi]
390  391            except Exception:
391  392                print(f"could not process {raw_data}")
392  393                # print(f'\t{e}')
393  394                self.error_count += 1
394  395                print(f"\terror count: {self.error_count}")
395       -             inchi = None
     396  +             tokenized = None
396  397                # if self.error_count > 20:
397  398                # raise Exception('Too many errors')
398       -         return inchi # one letter = one token
     399  +         return tokenized # one letter = one token
399  400
400  401
401  402    class OrdReader(DataReader):
class OrdReader(DataReader):

0 commit comments

Comments
 (0)