Prepare reaction yield data from different sources
import pandas as pd

Buchwald-Hartwig high-throughput experiment data set

For the Buchwald-Hartwig reactions we used the data set and splits as provided by

Sandfort et al. "A structure-based platform for predicting chemical reactivity." Chem (2020).

and originally published in

Ahneman et al. "Predicting reaction performance in C–N cross-coupling using machine learning." Science 360.6385 (2018): 186-190.

canonicalize_with_dict[source]

canonicalize_with_dict(smi, can_smi_dict={})

generate_buchwald_hartwig_rxns[source]

generate_buchwald_hartwig_rxns(df)

Converts the entries in the Excel files from Sandfort et al. to reaction SMILES.

# Reaction SMILES ("precursors>>product") that generate_buchwald_hartwig_rxns
# is expected to return for the five rows of `test_df`, listed in row order;
# they are compared element-wise with the converted reactions.
expected_rxns = ['Clc1ccccn1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.COc1ccc(OC)c(P([C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)[C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.Cc1cc(C)on1>>Cc1ccc(Nc2ccccn2)cc1',
 'Brc1ccccn1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.COc1ccc(OC)c(P([C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)[C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.COC(=O)c1ccno1>>Cc1ccc(Nc2ccccn2)cc1',
 'CCc1ccc(I)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.CC(C)c1cc(C(C)C)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(C(C)C)c1.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.COC(=O)c1ccno1>>CCc1ccc(Nc2ccc(C)cc2)cc1',
 'FC(F)(F)c1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.COc1ccc(OC)c(P(C(C)(C)C)C(C)(C)C)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.CN1CCCN2CCCN=C12.CCOC(=O)c1cnoc1>>Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1',
 'COc1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.COc1ccc(OC)c(P([C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)[C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.CN1CCCN2CCCN=C12.Cc1cc(C)on1>>COc1ccc(Nc2ccc(C)cc2)cc1']
# Five-row fixture with the columns "Ligand", "Additive", "Base", "Aryl halide"
# (all SMILES strings) and "Output" (float); the index labels are the strings
# "0" to "4". It is the input handed to generate_buchwald_hartwig_rxns.
test_df = pd.DataFrame({"Ligand":{"0":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC","1":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC","2":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)C4CCCCC4)C=CC=C2","3":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)C(C)(C)C)C(OC)=CC=C2OC","4":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC"},"Additive":{"0":"CC1=CC(C)=NO1","1":"O=C(OC)C1=CC=NO1","2":"O=C(OC)C1=CC=NO1","3":"CCOC(C1=CON=C1)=O","4":"CC1=CC(C)=NO1"},"Base":{"0":"CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC","1":"CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC","2":"CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC","3":"CN1CCCN2C1=NCCC2","4":"CN1CCCN2C1=NCCC2"},"Aryl halide":{"0":"ClC1=NC=CC=C1","1":"BrC1=NC=CC=C1","2":"IC1=CC=C(CC)C=C1","3":"ClC1=CC=C(C(F)(F)F)C=C1","4":"ClC1=CC=C(OC)C=C1"},"Output":{"0":70.41045785,"1":11.06445724,"2":10.22354965,"3":20.0833829,"4":0.492662711}})
# Convert the fixture rows and compare them with the expected reaction SMILES.
converted_rxns = list(generate_buchwald_hartwig_rxns(test_df))
# zip() stops at the shorter input, so without this length check an empty or
# truncated conversion result would let the loop below pass vacuously.
assert len(converted_rxns) == len(expected_rxns), (
    f"expected {len(expected_rxns)} reactions, got {len(converted_rxns)}"
)
for rxn, expected in zip(converted_rxns, expected_rxns):
    assert rxn == expected, f"converted reaction {rxn!r} != expected {expected!r}"

Suzuki-Miyaura reaction data set

Originally published in

Perera et al. "A platform for automated nanomole-scale reaction screening and micromole-scale synthesis in flow." Science 359.6374 (2018): 429-434.
# Load the raw screening table (Data File S1); the path is relative to the
# current working directory.
df = pd.read_excel('../data/Suzuki-Miyaura/aap9112_Data_File_S1.xlsx')
# Lookup from the reactant-1 name (the 'Reactant_1_Name' column used by
# make_reaction_smiles) to SMILES. Values are dot-separated multi-component
# SMILES: the chloro/bromo/iodo/triflate quinolines carry an additional
# 'CCC1=CC(=CC=C1)CC' component, the boron reagents an additional 'O'
# (plus 'Cl' or '[K+]' for the salt forms).
# NOTE(review): presumably the extra components mirror how the stock
# solutions were prepared -- confirm against the original publication.
reactant_1_smiles = {
    '6-chloroquinoline': 'C1=C(Cl)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC', 
    '6-Bromoquinoline': 'C1=C(Br)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC', 
    '6-triflatequinoline': 'C1C2C(=NC=CC=2)C=CC=1OS(C(F)(F)F)(=O)=O.CCC1=CC(=CC=C1)CC',
    '6-Iodoquinoline': 'C1=C(I)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC', 
    '6-quinoline-boronic acid hydrochloride': 'C1C(B(O)O)=CC=C2N=CC=CC=12.Cl.O',
    'Potassium quinoline-6-trifluoroborate': '[B-](C1=CC2=C(C=C1)N=CC=C2)(F)(F)F.[K+].O',
    '6-Quinolineboronic acid pinacol ester': 'B1(OC(C(O1)(C)C)(C)C)C2=CC3=C(C=C2)N=CC=C3.O'
}

# Lookup from the reactant-2 name (the 'Reactant_2_Name' column used by
# make_reaction_smiles) to SMILES. The four entries share the same scaffold
# and differ only in the final substituent: B(O)O, a pinacol-type boronic
# ester, [B-](F)(F)F with a separate [K+] counter-ion, or Br.
reactant_2_smiles = {
    '2a, Boronic Acid': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1B(O)O', 
    '2b, Boronic Ester': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1B4OC(C)(C)C(C)(C)O4', 
    '2c, Trifluoroborate': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1[B-](F)(F)F.[K+]',
    '2d, Bromide': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1Br' 
}

# Lookup from the catalyst short-hand (the 'Catalyst_1_Short_Hand' column used
# by make_reaction_smiles) to SMILES. Only one catalyst is listed; its two
# acetic-acid fragments are joined to [Pd] with '~' bonds, so the catalyst is
# written as a single component without '.' separators.
catalyst_smiles = {
    'Pd(OAc)2': 'CC(=O)O~CC(=O)O~[Pd]'
}

# Lookup from the ligand short-hand (the 'Ligand_Short_Hand' column used by
# make_reaction_smiles) to SMILES. 'None' maps to the empty string, i.e. no
# ligand component; make_reaction_smiles removes the resulting empty
# component from the precursor string.
# NOTE(review): the key 'P(Ph)3 ' has a trailing space -- presumably it
# mirrors the value stored in the spreadsheet; confirm before normalizing it.
ligand_smiles = {
    'P(tBu)3': 'CC(C)(C)P(C(C)(C)C)C(C)(C)C', 
    'P(Ph)3 ': 'c3c(P(c1ccccc1)c2ccccc2)cccc3', 
    'AmPhos': 'CC(C)(C)P(C1=CC=C(C=C1)N(C)C)C(C)(C)C', 
    'P(Cy)3': 'C1(CCCCC1)P(C2CCCCC2)C3CCCCC3', 
    'P(o-Tol)3': 'CC1=CC=CC=C1P(C2=CC=CC=C2C)C3=CC=CC=C3C',
    'CataCXium A': 'CCCCP(C12CC3CC(C1)CC(C3)C2)C45CC6CC(C4)CC(C6)C5', 
    'SPhos': 'COc1cccc(c1c2ccccc2P(C3CCCCC3)C4CCCCC4)OC', 
    'dtbpf': 'CC(C)(C)P(C1=CC=C[CH]1)C(C)(C)C.CC(C)(C)P(C1=CC=C[CH]1)C(C)(C)C.[Fe]', 
    'XPhos': 'P(c2ccccc2c1c(cc(cc1C(C)C)C(C)C)C(C)C)(C3CCCCC3)C4CCCCC4', 
    'dppf': 'C1=CC=C(C=C1)P([C-]2C=CC=C2)C3=CC=CC=C3.C1=CC=C(C=C1)P([C-]2C=CC=C2)C3=CC=CC=C3.[Fe+2]', 
    'Xantphos': 'O6c1c(cccc1P(c2ccccc2)c3ccccc3)C(c7cccc(P(c4ccccc4)c5ccccc5)c67)(C)C',
    'None': ''
}

# Lookup from the reagent short-hand (the 'Reagent_1_Short_Hand' column used
# by make_reaction_smiles) to SMILES. Ionic reagents are written as separate
# charged components (e.g. '[OH-].[Na+]'); 'None' maps to the empty string,
# i.e. no reagent component.
reagent_1_smiles = {
    'NaOH': '[OH-].[Na+]', 
    'NaHCO3': '[Na+].OC([O-])=O', 
    'CsF': '[F-].[Cs+]', 
    'K3PO4': '[K+].[K+].[K+].[O-]P([O-])([O-])=O', 
    'KOH': '[K+].[OH-]', 
    'LiOtBu': '[Li+].[O-]C(C)(C)C', 
    'Et3N': 'CCN(CC)CC', 
    'None': ''
}

# Lookup from the solvent short-hand (the 'Solvent_1_Short_Hand' column used
# by make_reaction_smiles) to SMILES. Every entry includes an additional 'O'
# component. 'MeOH/H2O_V2 9:1' maps to the same SMILES as 'MeOH', and
# 'THF_V2' to the same SMILES as 'THF', so those variants are not
# distinguished in the generated reaction SMILES.
solvent_1_smiles = {
    'MeCN': 'CC#N.O', 
    'THF': 'C1CCOC1.O', 
    'DMF': 'CN(C)C=O.O', 
    'MeOH': 'CO.O', 
    'MeOH/H2O_V2 9:1': 'CO.O', 
    'THF_V2': 'C1CCOC1.O'
}

def make_reaction_smiles(row):
    """Build a canonical reaction SMILES for one Suzuki-Miyaura experiment.

    The component names stored in ``row`` are translated to SMILES with the
    module-level lookup dictionaries, joined into one dot-separated precursor
    string, and canonicalized. The product is the same fixed coupling product
    for every row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'Reactant_1_Name', 'Reactant_2_Name', 'Catalyst_1_Short_Hand',
            'Ligand_Short_Hand', 'Reagent_1_Short_Hand' and
            'Solvent_1_Short_Hand'.

    Returns:
        The reaction SMILES as ``"<canonical precursors>>><canonical product>"``.

    Raises:
        KeyError: If a component name is missing from its lookup dictionary.
        ValueError: If the precursor or product SMILES cannot be parsed.
    """
    # NOTE(review): `Chem` is expected to be in scope already (presumably
    # RDKit's `Chem` module, imported outside the part of the file shown here).
    components = (
        reactant_1_smiles[row['Reactant_1_Name']],
        reactant_2_smiles[row['Reactant_2_Name']],
        catalyst_smiles[row['Catalyst_1_Short_Hand']],
        ligand_smiles[row['Ligand_Short_Hand']],
        reagent_1_smiles[row['Reagent_1_Short_Hand']],
        solvent_1_smiles[row['Solvent_1_Short_Hand']],
    )
    # The 'None' ligand/reagent entries map to '', so drop empty components
    # here instead of patching up doubled dots in the joined string afterwards.
    precursors = '.'.join(smi for smi in components if smi)
    product = 'C1=C(C2=C(C)C=CC3N(C4OCCCC4)N=CC2=3)C=CC2=NC=CC=C12'

    precursor_mol = Chem.MolFromSmiles(precursors)
    if precursor_mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError(f"Could not parse precursor SMILES: {precursors!r}")
    product_mol = Chem.MolFromSmiles(product)
    if product_mol is None:
        raise ValueError(f"Could not parse product SMILES: {product!r}")

    can_precursors = Chem.MolToSmiles(precursor_mol)
    can_product = Chem.MolToSmiles(product_mol)

    return f"{can_precursors}>>{can_product}"

Random splits

from pathlib import Path

# Build one reaction SMILES per experiment and rescale the yield column by
# 1/100 (the column name suggests it is stored as a percentage).
df['rxn'] = [make_reaction_smiles(row) for _, row in df.iterrows()]
df['y'] = df['Product_Yield_PCT_Area_UV'] / 100.
reactions_df = df[['rxn', 'y']]

# to_csv does not create missing directories, so make sure the target exists.
splits_dir = Path('../data/Suzuki-Miyaura/random_splits')
splits_dir.mkdir(parents=True, exist_ok=True)

# Write ten reproducible shufflings of the full data set, one per seed.
# The original row index is kept as the first column of each TSV file.
for seed in range(10):
    new_df = reactions_df.sample(frac=1, random_state=seed)
    new_df.to_csv(splits_dir / f'random_split_{seed}.tsv', sep='\t')
    print(new_df.head(2))
                                                   rxn         y
464  Brc1ccc2ncccc2c1.CC#N.CC(=O)O~CC(=O)O~[Pd].CC(...  0.643011
98   C1CCOC1.CC(=O)O~CC(=O)O~[Pd].CCc1cccc(CC)c1.CN...  0.612001
                                                    rxn         y
473   Brc1ccc2ncccc2c1.CC#N.CC(=O)O~CC(=O)O~[Pd].CCC...  0.918077
5519  CC#N.CC(=O)O~CC(=O)O~[Pd].CC(C)(C)[O-].CC1(C)O...  0.513346
                                                    rxn         y
4944  CC(=O)O~CC(=O)O~[Pd].CC(C)(C)P(C(C)(C)C)C(C)(C...  0.233489
3927  Brc1ccc2ncccc2c1.CC#N.CC(=O)O~CC(=O)O~[Pd].CCc...  0.178764
                                                   rxn         y
914  C1CCOC1.CC(=O)O~CC(=O)O~[Pd].CCc1cccc(CC)c1.CN...  0.114641
940  C1CCOC1.CC(=O)O~CC(=O)O~[Pd].CCN(CC)CC.CCc1ccc...  0.540637
                                                    rxn         y
3896  Brc1ccc2ncccc2c1.CC#N.CC(=O)O~CC(=O)O~[Pd].CC(...  0.134639
1991  CC#N.CC(=O)O~CC(=O)O~[Pd].CC(C)(C)[O-].CCc1ccc...  0.654905
                                                    rxn         y
1872  CC(=O)O~CC(=O)O~[Pd].CC(C)(C)P(C(C)(C)C)C(C)(C...  0.193283
1879  CC(=O)O~CC(=O)O~[Pd].CC(C)(C)P(C1=CC=C[CH]1)C(...  0.503925
                                                    rxn         y
5207  C1CCOC1.CC(=O)O~CC(=O)O~[Pd].Cc1ccc2c(cnn2C2CC...  0.288076
2444  Brc1ccc2ncccc2c1.C1CCOC1.CC(=O)O~CC(=O)O~[Pd]....  0.412966
                                                    rxn         y
5759  CC(=O)O~CC(=O)O~[Pd].CC1(C)OB(c2ccc3ncccc3c2)O...  0.541974
651   Brc1ccc2ncccc2c1.C1CCC(P(C2CCCCC2)C2CCCCC2)CC1...  0.382453
                                                    rxn         y
1133  CC(=O)O~CC(=O)O~[Pd].CCCCP(C12CC3CC(CC(C3)C1)C...  0.808541
4835  C1CCOC1.CC(=O)O~CC(=O)O~[Pd].Cc1ccc2c(cnn2C2CC...  0.349989
                                                    rxn         y
1412  CC(=O)O~CC(=O)O~[Pd].CC(C)(C)[O-].CC(C)c1cc(C(...  0.752941
886   C1CCOC1.CC(=O)O~CC(=O)O~[Pd].CC1(C)c2cccc(P(c3...  0.082582