Prepare reaction yield data from different sources
import pandas as pd
Buchwald-Hartwig high-throughput experiment data set
For the Buchwald-Hartwig reactions we used the data set and splits as provided by
Sandfort et al. "A structure-based platform for predicting chemical reactivity." Chem (2020).
and originally published in
Ahneman et al. "Predicting reaction performance in C–N cross-coupling using machine learning." Science 360.6385 (2018): 186-190..
expected_rxns = ['Clc1ccccn1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.COc1ccc(OC)c(P([C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)[C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.Cc1cc(C)on1>>Cc1ccc(Nc2ccccn2)cc1',
'Brc1ccccn1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.COc1ccc(OC)c(P([C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)[C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.COC(=O)c1ccno1>>Cc1ccc(Nc2ccccn2)cc1',
'CCc1ccc(I)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.CC(C)c1cc(C(C)C)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(C(C)C)c1.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.COC(=O)c1ccno1>>CCc1ccc(Nc2ccc(C)cc2)cc1',
'FC(F)(F)c1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.COc1ccc(OC)c(P(C(C)(C)C)C(C)(C)C)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.CN1CCCN2CCCN=C12.CCOC(=O)c1cnoc1>>Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1',
'COc1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2N~1)C(F)(F)F.COc1ccc(OC)c(P([C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)[C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.CN1CCCN2CCCN=C12.Cc1cc(C)on1>>COc1ccc(Nc2ccc(C)cc2)cc1']
test_df = pd.DataFrame({"Ligand":{"0":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC","1":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC","2":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)C4CCCCC4)C=CC=C2","3":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)C(C)(C)C)C(OC)=CC=C2OC","4":"CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC"},"Additive":{"0":"CC1=CC(C)=NO1","1":"O=C(OC)C1=CC=NO1","2":"O=C(OC)C1=CC=NO1","3":"CCOC(C1=CON=C1)=O","4":"CC1=CC(C)=NO1"},"Base":{"0":"CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC","1":"CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC","2":"CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC","3":"CN1CCCN2C1=NCCC2","4":"CN1CCCN2C1=NCCC2"},"Aryl halide":{"0":"ClC1=NC=CC=C1","1":"BrC1=NC=CC=C1","2":"IC1=CC=C(CC)C=C1","3":"ClC1=CC=C(C(F)(F)F)C=C1","4":"ClC1=CC=C(OC)C=C1"},"Output":{"0":70.41045785,"1":11.06445724,"2":10.22354965,"3":20.0833829,"4":0.492662711}})
converted_rxns = generate_buchwald_hartwig_rxns(test_df)
for rxn, expected in zip(converted_rxns, expected_rxns):
assert rxn == expected
df = pd.read_excel('../data/Suzuki-Miyaura/aap9112_Data_File_S1.xlsx')
reactant_1_smiles = {
'6-chloroquinoline': 'C1=C(Cl)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC',
'6-Bromoquinoline': 'C1=C(Br)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC',
'6-triflatequinoline': 'C1C2C(=NC=CC=2)C=CC=1OS(C(F)(F)F)(=O)=O.CCC1=CC(=CC=C1)CC',
'6-Iodoquinoline': 'C1=C(I)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC',
'6-quinoline-boronic acid hydrochloride': 'C1C(B(O)O)=CC=C2N=CC=CC=12.Cl.O',
'Potassium quinoline-6-trifluoroborate': '[B-](C1=CC2=C(C=C1)N=CC=C2)(F)(F)F.[K+].O',
'6-Quinolineboronic acid pinacol ester': 'B1(OC(C(O1)(C)C)(C)C)C2=CC3=C(C=C2)N=CC=C3.O'
}
reactant_2_smiles = {
'2a, Boronic Acid': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1B(O)O',
'2b, Boronic Ester': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1B4OC(C)(C)C(C)(C)O4',
'2c, Trifluoroborate': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1[B-](F)(F)F.[K+]',
'2d, Bromide': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1Br'
}
catalyst_smiles = {
'Pd(OAc)2': 'CC(=O)O~CC(=O)O~[Pd]'
}
ligand_smiles = {
'P(tBu)3': 'CC(C)(C)P(C(C)(C)C)C(C)(C)C',
'P(Ph)3 ': 'c3c(P(c1ccccc1)c2ccccc2)cccc3',
'AmPhos': 'CC(C)(C)P(C1=CC=C(C=C1)N(C)C)C(C)(C)C',
'P(Cy)3': 'C1(CCCCC1)P(C2CCCCC2)C3CCCCC3',
'P(o-Tol)3': 'CC1=CC=CC=C1P(C2=CC=CC=C2C)C3=CC=CC=C3C',
'CataCXium A': 'CCCCP(C12CC3CC(C1)CC(C3)C2)C45CC6CC(C4)CC(C6)C5',
'SPhos': 'COc1cccc(c1c2ccccc2P(C3CCCCC3)C4CCCCC4)OC',
'dtbpf': 'CC(C)(C)P(C1=CC=C[CH]1)C(C)(C)C.CC(C)(C)P(C1=CC=C[CH]1)C(C)(C)C.[Fe]',
'XPhos': 'P(c2ccccc2c1c(cc(cc1C(C)C)C(C)C)C(C)C)(C3CCCCC3)C4CCCCC4',
'dppf': 'C1=CC=C(C=C1)P([C-]2C=CC=C2)C3=CC=CC=C3.C1=CC=C(C=C1)P([C-]2C=CC=C2)C3=CC=CC=C3.[Fe+2]',
'Xantphos': 'O6c1c(cccc1P(c2ccccc2)c3ccccc3)C(c7cccc(P(c4ccccc4)c5ccccc5)c67)(C)C',
'None': ''
}
reagent_1_smiles = {
'NaOH': '[OH-].[Na+]',
'NaHCO3': '[Na+].OC([O-])=O',
'CsF': '[F-].[Cs+]',
'K3PO4': '[K+].[K+].[K+].[O-]P([O-])([O-])=O',
'KOH': '[K+].[OH-]',
'LiOtBu': '[Li+].[O-]C(C)(C)C',
'Et3N': 'CCN(CC)CC',
'None': ''
}
solvent_1_smiles = {
'MeCN': 'CC#N.O',
'THF': 'C1CCOC1.O',
'DMF': 'CN(C)C=O.O',
'MeOH': 'CO.O',
'MeOH/H2O_V2 9:1': 'CO.O',
'THF_V2': 'C1CCOC1.O'
}
def make_reaction_smiles(row):
precursors = f" {reactant_1_smiles[row['Reactant_1_Name']]}.{reactant_2_smiles[row['Reactant_2_Name']]}.{catalyst_smiles[row['Catalyst_1_Short_Hand']]}.{ligand_smiles[row['Ligand_Short_Hand']]}.{reagent_1_smiles[row['Reagent_1_Short_Hand']]}.{solvent_1_smiles[row['Solvent_1_Short_Hand']]} "
product = 'C1=C(C2=C(C)C=CC3N(C4OCCCC4)N=CC2=3)C=CC2=NC=CC=C12'
# print(precursors, product)
can_precursors = Chem.MolToSmiles(Chem.MolFromSmiles(precursors.replace('...', '.').replace('..', '.').replace(' .', '').replace('. ', '').replace(' ', '')))
can_product = Chem.MolToSmiles(Chem.MolFromSmiles(product))
return f"{can_precursors}>>{can_product}"
df['rxn']= [make_reaction_smiles(row) for i, row in df.iterrows()]
df['y'] = df['Product_Yield_PCT_Area_UV']/ 100.
reactions_df = df[['rxn', 'y']]
for seed in range(10):
new_df = reactions_df.sample(frac=1, random_state=seed)[['rxn', 'y']]
new_df.to_csv(f'../data/Suzuki-Miyaura/random_splits/random_split_{seed}.tsv', sep='\t')
print(new_df.head(2))