Introduction

Transforming legacy system data for migration to Salesforce doesn’t have to be a chore. In this instructional video, I’ll show you how to to use Python to easily perform and automate Salesforce data transformation tasks. We’ll cover each step in the process – from initial data analysis to data transformations and Salesforce data loads.  I’ll also provide my top 5 recommendations to ensure your migrations are trouble-free.

In addition to Python, we’ll also learn how to use Simple Salesforce, Pandas, pypyodbc, Ant and batch files to help us automate the entire data analysis, transformation and load process.  I provide the files I use in my demo below the video.

I have a companion article that provides additional Python commands and automations that you will find useful. You can find the article on our RadixBay blog page.

If you  would like a copy of the slides I used in the video, you can download them here. 

Video Demo – Automating Salesforce Data Migrations with Python

Files I Use in the Video Demo.

Please note that these files are for instructional purposes only.   They will require modifications to work in other environments.   There is no explicit or implied guarantee.   Use at your own risk and lower that risk by performing extensive testing.

DataAnalysis.bat

@ECHO OFF
title Data Analysis Tool

ECHO Welcome to RadixBay’s Data Analysis Tool.

:AccountAnalysis
python C:\Users\kimsmall\Documents\BaseMigrationRepo\data_migration\accounts\account_analysis.py
ECHO AccountAnalysis complete.

ECHO Analysis Complete.
pause
EXIT

 

account_analysis.py

”’
Analyzes the account tables.
”’

import datetime
from pathlib import Path

import pandas as pd

from utils import analyze_data

# pylint: disable=line-too-long, invalid-name

print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [AccountAnalysis] Loading account files.”)

# Define Foler Paths of Required Tables
folder = Path(“C:/Users/kimsmall/Documents/BaseMigrationData”)
acc_file = folder / “ClientData/Account.csv”
acc_analysis = folder / “DataAnalysis/AccountAnalysis.xlsx”

# Convert CSVs to Dataframes
acc_table = pd.read_csv(acc_file, engine=’python’)

print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [AccountAnalysis] Account files loaded. Starting data analysis.”)

# Analyze the data and print results to an excel sheet
analyze_data(acc_table, acc_analysis)

print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [AccountAnalysis] Data analysis complete.”)

 

DataMigration.bat

@ECHO OFF
title Data Migration Tool

ECHO Welcome to RadixBay’s Data Migration Tool.
pause

:AccountPrep
python C:\Users\kimsmall\Documents\BaseMigrationRepo\data_migration\accounts\account_data_prep.py
ECHO AccountPrep complete.
pause

:AccountLoad
call process.bat “C:\Users\kimsmall\Documents\BaseMigrationRepo\data_migration\config” upsertAccounts
ECHO upsertAccounts complete.
pause

:End
ECHO Migration Complete.
pause
EXIT

 

account_data_prep.py

”’
Preps the Account data before loading into Access database.
”’

import datetime
from pathlib import Path
import pandas as pd

from account_constants import ACCOUNT_TABLE_NAME, ACCOUNT_CREATE_QUERY, UPDATE_RATING
from global_constants import JOB_ID, RENAME_ID
from utils import insert_data

# pylint: disable=line-too-long, invalid-name

print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [AccountPrep] Loading account files.”)

# Define Folder Paths of Required Tables
folder = Path(“C:/Users/kimsmall/Documents/BaseMigrationData”)
account_file = folder / “ClientData/Account.csv”

# Convert CSVs to Dataframes
account_table = pd.read_csv(account_file, engine=’python’)

print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [AccountPrep] Account files loaded. Starting data preparation.”)

# Rename columns
account_table = account_table.rename(columns=RENAME_ID)

# Update values
account_table = account_table.replace(UPDATE_RATING)

# Add the Job IDs
account_table[‘Job_Id__c’] = JOB_ID

# Remove NaN values
account_table = account_table.replace({pd.np.nan: None})

print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [AccountPrep] Data preparation complete.”)

# Insert content data into Access database
insert_data(account_table, ACCOUNT_TABLE_NAME, ACCOUNT_CREATE_QUERY)

 

utils.py

 

”’
Utilities that will be used throughout app
”’

import datetime
import os

import numpy as np
import pandas as pd
import win32com.client

import pypyodbc

# pylint: disable=line-too-long

def insert_data(dataframe, table_name, create_query):
”’Inserts data into Access database”’
print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [ACCESSDB] Connecting to database.”)

# Connect to Database
connection_string = (
r’Driver={Microsoft Access Driver (*.mdb, *.accdb)};’
r’DBQ=C:\Users\kimsmall\Documents\BaseMigrationData\BaseMigrationDatabase.accdb’
)

db_connection = pypyodbc.connect(connection_string, autocommit=True) # pylint: disable=c-extension-no-member
cursor = db_connection.cursor()

print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [ACCESSDB] Connected to database. Inserting data into database.”)

# Define boolean that will determine if a table was removed
removed_table = False

# If the table already exists, remove the current table
if list(cursor.tables(table_name)):
cursor.execute(f”DROP TABLE [{table_name}]”)
removed_table = True

# Create the new table
cursor.execute(create_query.format(name=table_name))

# Create column list
column_list = “`,`”.join([str(i) for i in dataframe.columns])

# Insert the rows from the Dataframe into the Access table
for i, row in dataframe.iterrows(): # pylint: disable=unused-variable
insert_query = “INSERT INTO {name} (`” +column_list + “`) VALUES (” + “?,”*(len(row)-1) + “?)”
cursor.execute(insert_query.format(name=table_name), tuple(row))

print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [ACCESSDB] Insertion of data complete.”)

# Close out the database connection
db_connection.close()

# If a table was removed from the database, compact and repair the database
if removed_table:
compact_and_repair_database()

def compact_and_repair_database():
”’Compacts and repairs the Access database after removing table”’
print(datetime.datetime.now().strftime(“%I:%M:%S %p”) + “: [ACCESSDB] Compacting and repairing the database.”)

# Define the paths to the database and its backup
db_path = r’C:\Users\kimsmall\Documents\BaseMigrationData\BaseMigrationDatabase.accdb’
backup_db_path = r’C:\Users\kimsmall\Documents\BaseMigrationData\BaseMigrationDatabase_backup.accdb’

# Compact and Repair the Access Database
o_app = win32com.client.Dispatch(“Access.Application”)
o_app.compactRepair(db_path, backup_db_path)

# Remove backup and reset oApp
os.remove(backup_db_path)
o_app = None

def analyze_data(table, result_file):
”’Analyzes any dataframes passed in.”’
# Define lists that we will need
cols = []
max_length = []
count = []
unique_count = []

# Get the shape of the data table
data_shape = table.shape
summary = pd.DataFrame(index=[‘Number of Records’, ‘Number of Columns’])
summary[‘Counts’] = data_shape

# Replace NaN values with None
# table = table.replace({pd.np.nan: None})

# Get a list of columns from the data
for col in table.columns:
cols.append(col)
analysis = pd.DataFrame(index=cols)

# Get a list of columns from the data
for col in table.columns:
string_col = table[col].astype(str)
string_col = string_col.replace(‘nan’, ”)
if table[col].dtype == ‘int64’ or table[col].dtype == ‘float64’:
string_col = string_col.replace(r’.0′, ”, regex=True)
max_length.append(int(string_col.str.len().max()))
else:
max_length.append(int(string_col.str.len().max()))
analysis[‘Max Length’] = max_length

# Get the counts and print to excel sheet
column_count = table.count()
for i in range(0, column_count.index.size):
count.append(column_count.iloc[i])
analysis[‘Count’] = count

# Get the unique counts and print to excel sheet
unique_vals = table.nunique()
for i in range(0, unique_vals.index.size):
unique_count.append(unique_vals.iloc[i])
analysis[‘Distinct Count’] = unique_count

# Determine whether a field is nullable
analysis[‘Nullable’] = np.where(
analysis.Count < summary.at[‘Number of Records’, ‘Counts’], ‘Yes’, ‘No’)

# Determine the field usage percentage
record_count = table.shape[0]
analysis[‘Field Usage’] = analysis[‘Count’] / record_count

# Print results to excel sheet
writer = pd.ExcelWriter(result_file, engine=’xlsxwriter’)
summary.to_excel(writer, sheet_name=”Summary”)
analysis.to_excel(writer, sheet_name=”Analysis”)
writer.save()

 

I hope you enjoyed my video on using Python for Salesforce data transformation.

To learn more about our Salesforce services and how RadixBay can help, please visit our RadixBay Salesforce Center of Excellence.

Kimberly Small
RadixBay Salesforce Consultant
Salesforce Certified Platform Developer I
Salesforce Certified Platform App Builder