Use fingerprints to classify the 50k reaction data set by Schneider et al. This notebook is based on the notebooks and code found in the SI of "Development of a Novel Fingerprint for Chemical Reactions and Its Application to Large-Scale Reaction Classification and Similarity".
# Partition the reactions according to the `split` column of `schneider_df`.
train_df = schneider_df.loc[schneider_df['split'] == 'train']
test_df = schneider_df.loc[schneider_df['split'] == 'test']
# Print the number of rows in each partition as a quick sanity check.
print(*map(len, (train_df, test_df)))
# Negative control: fit the classifier on shuffled training labels, then
# evaluate it against the true (unshuffled) test labels.
test_rxn_class_ids = [rxnclass2id[rxn_class] for rxn_class in test_df['rxn_class']]
scrambled_train_rxn_ids = [rxnclass2id[rxn_class] for rxn_class in train_df['rxn_class']]

# A fixed seed makes the label permutation reproducible between runs.
random.seed(42)
random.shuffle(scrambled_train_rxn_ids)

lr_cls = LogisticRegression(max_iter=1000)
lr_classifier_scrambled = lr_cls.fit(
    train_df['ft_10k'].values.tolist(),
    scrambled_train_rxn_ids,
)

confusion_matrix_scrambled = evaluate_model(
    lr_classifier_scrambled,
    test_df['ft_10k'].values.tolist(),
    test_rxn_class_ids,
    all_classes,
    rxnclass2name,
)
fig = labelled_cmat(
    confusion_matrix_scrambled,
    all_classes,
    figsize=(16, 12),
    label_extras=rxnclass2name,
)
With the randomized training labels, this is what we expected: the classifier cannot learn a meaningful mapping from fingerprints to reaction classes.
# Fit a logistic regression on the `ft_10k` fingerprints with the real labels.
# NOTE(review): training labels come from `train_df.class_id`, while the test
# labels (`test_rxn_class_ids`) are built via `rxnclass2id` -- presumably the
# same encoding; confirm where `class_id` is created.
lr_cls = LogisticRegression(max_iter=5000)
lr_classifier_ft_10k_trained = lr_cls.fit(
    train_df['ft_10k'].values.tolist(),
    train_df['class_id'].values.tolist(),
)

# Evaluate on the held-out test fingerprints and plot the confusion matrix.
confusion_matrix_ft_10k = evaluate_model(
    lr_classifier_ft_10k_trained,
    test_df['ft_10k'].values.tolist(),
    test_rxn_class_ids,
    all_classes,
    rxnclass2name,
)
fig = labelled_cmat(
    confusion_matrix_ft_10k,
    all_classes,
    figsize=(16, 12),
    label_extras=rxnclass2name,
)
# Uncomment to export the figure:
#fig.savefig('cm_ft_10k.pdf')
# Fit a logistic regression on the `ft_pretrained` fingerprints with the real
# labels.
# NOTE(review): training labels come from `train_df.class_id`, while the test
# labels (`test_rxn_class_ids`) are built via `rxnclass2id` -- presumably the
# same encoding; confirm where `class_id` is created.
lr_cls = LogisticRegression(max_iter=5000)
lr_classifier_ft_pretrained = lr_cls.fit(
    train_df['ft_pretrained'].values.tolist(),
    train_df['class_id'].values.tolist(),
)

# Evaluate on the matching test fingerprints and plot the confusion matrix.
confusion_matrix_pretrained = evaluate_model(
    lr_classifier_ft_pretrained,
    test_df['ft_pretrained'].values.tolist(),
    test_rxn_class_ids,
    all_classes,
    rxnclass2name,
)
fig = labelled_cmat(
    confusion_matrix_pretrained,
    all_classes,
    figsize=(16, 12),
    label_extras=rxnclass2name,
)
# Uncomment to export the figure:
#fig.savefig('cm_ft_pretrained.pdf')
# Cross-evaluation: score the classifier that was fitted on the
# `ft_pretrained` fingerprints against the `ft_10k` test fingerprints.
# The result is stored under its own name so it does not overwrite
# `confusion_matrix_pretrained`, which holds the evaluation on the matching
# `ft_pretrained` test fingerprints; reusing that name made the notebook
# state depend on which cell was executed last.
confusion_matrix_pretrained_on_ft_10k = evaluate_model(
    lr_classifier_ft_pretrained,
    test_df.ft_10k.values.tolist(),
    test_rxn_class_ids,
    all_classes,
    rxnclass2name,
)
fig = labelled_cmat(
    confusion_matrix_pretrained_on_ft_10k,
    all_classes,
    figsize=(16, 12),
    label_extras=rxnclass2name,
)
# Uncomment to export the figure:
#fig.savefig('cm_ft_pretrained_on_ft_10k.pdf')