The deduplication extension isn't efficient due to the large number of duplicates we have—thousands in total. Therefore, I’ve created a script to merge and remove duplicate records based on identifying fields.
The script runs successfully when I don't merge linked record data, but this approach results in valuable information being lost. When I include the merging logic, the script works on smaller datasets (Although, take some time), but it crashes when processing larger datasets.
This is the error I get:
This is my script:
// Hard-coded configuration
let tableName = 'Contacts copy 3'; // Replace with your table name
let idFieldName = 'Contact'; // Replace with your identifying field name
let emailFieldName = 'Email Address 1'; // Field name for Email Address 1
let mobileFieldName = 'Mobile'; // Field name for Mobile
let viewName = 'test 2'; // Replace with the name of the view you want to use
// Retrieve the table
let table = base.getTable(tableName);
// Airtable limits batch operations to 50 records or fewer.
let maxRecordsPerCall = 50;
// Function to normalize text
function normalizeText(text) {
if (!text) return '';
return text
.toLowerCase() // Convert to lowercase
.normalize('NFKD') // Normalize accents
.replace(/[\u0300-\u036f]/g, '') // Remove accents
.replace(/[^\w\s]/g, '') // Remove punctuation
.replace(/\s+/g, ' ') // Normalize whitespace
.trim(); // Trim leading/trailing whitespace
}
// Function to create a key for identifying records
function createKey(record) {
let idValue = normalizeText(record.getCellValue(idFieldName));
let emailValue = normalizeText(record.getCellValue(emailFieldName));
let mobileValue = normalizeText(record.getCellValue(mobileFieldName));
return JSON.stringify([idValue, emailValue, mobileValue]);
}
let existing = Object.create(null);
let recordsToUpdate = [];
let recordsToDelete = [];
// Part 1: Identify records with matching contacts
let query = await table.selectRecordsAsync({
fields: [idFieldName, emailFieldName, mobileFieldName],
view: viewName // Query records from the selected view
});
for (let record of query.records) {
let key = createKey(record);
if (existing[key]) {
existing[key].push(record);
} else {
existing[key] = [record];
}
}
// Process records based on the conditions
for (let records of Object.values(existing)) {
if (records.length > 1) {
// Sort records by created time or other criteria
records.sort((a, b) => new Date(b.createdTime) - new Date(a.createdTime));
let masterRecord = records[0];
let contact = normalizeText(masterRecord.getCellValue(idFieldName));
let emailSet = new Set();
let mobileSet = new Set();
for (let record of records) {
let email = normalizeText(record.getCellValue(emailFieldName));
let mobile = normalizeText(record.getCellValue(mobileFieldName));
emailSet.add(email);
mobileSet.add(mobile);
// Prepare records for deletion
if (record.id !== masterRecord.id) {
recordsToDelete.push(record.id);
}
}
// Update the master record with merged fields
let mergedFields = {
[emailFieldName]: [...emailSet].join(', '),
[mobileFieldName]: [...mobileSet].join(', ')
};
if (Object.keys(mergedFields).length > 0) {
recordsToUpdate.push({
id: masterRecord.id,
fields: mergedFields
});
}
}
}
// Apply updates to merge fields
console.log(`Updating ${recordsToUpdate.length} records...`);
while (recordsToUpdate.length > 0) {
let batch = recordsToUpdate.slice(0, maxRecordsPerCall);
await table.updateRecordsAsync(batch);
recordsToUpdate = recordsToUpdate.slice(maxRecordsPerCall);
}
// Apply deletions
console.log(`Deleting ${recordsToDelete.length} records...`);
while (recordsToDelete.length > 0) {
let batch = recordsToDelete.slice(0, maxRecordsPerCall);
await table.deleteRecordsAsync(batch);
recordsToDelete = recordsToDelete.slice(maxRecordsPerCall);
}
console.log('Done');
Any suggestions?