Diakonos1984
09-06-2013, 07:12 AM
I have a large spreadsheet (5000+ rows, with 50+ columns). It is a database report of items that have relationships, and it ends up listing every possible permutation of the related items. Roughly half of this spreadsheet ends up being rows made up entirely of duplicate data.
What I need to be able to do is search selected columns and identify duplicate data, differentiating between first instance and subsequent instances. Then I need to delete each row that contains ONLY subsequent duplicate data. If a row contains even one unique or first instance item, it is NOT to be deleted.
I have written a series of subroutines to do this, and they mostly work. It's just super slow. Can you identify some performance killers and help me speed this up?
I have uploaded a half-size version of the spreadsheet if you want to play with it. The full version is too big! 10552
On this half-size version with 3727 rows, the whole routine takes about 20 minutes and deletes 2549 rows.
Complete code is below:
Sub SKMMainRPurgeDups() 'Main wrapper for finding duplicates in each column and then deleting rows containing only Subsequent duplicates
Dim wsTarget As Worksheet
Dim strStarTime As String
Dim strStopTime As String
strStartTime = Now()
Set wsTarget = ActiveSheet
SKMMainRIDDups wsTarget, vbGreen, vbRed
SKMMainRDeleteDups wsTarget, vbGreen, vbRed
wsTarget.ListObjects(1).DataBodyRange.Interior.ColorIndex = xlNone
strStopTime = Now()
Debug.Print "Started at " & strStartTime & "; finished at " & strStopTime
Beep 'since I'm probably not paying attention....
End Sub
'####################################################
Sub SKMMainRIDDups(wsTarget As Worksheet, First_Color As Variant, Subsequent_Color As Variant)
'Identifies duplicates in each column
Dim indTarCol As Long
Dim indTarCols As Long
Debug.Print "Starting SKMMainRIDDups: " & Now()
Application.ScreenUpdating = False
Set wsTarget = ActiveSheet
indTarCols = wsTarget.ListObjects(1).ListColumns.Count
'This For statement and the following two If statements limit my searching to specific columns. I don't care about duplicates in other columns.
For indTarCol = 5 To indTarCols
If Not wsTarget.ListObjects(1).HeaderRowRange.Cells(1, indTarCol).Find("tbl", LookIn:=xlValues, LookAt:=xlPart) Is Nothing Then
If Not wsTarget.ListObjects(1).HeaderRowRange.Cells(1, indTarCol).Find("Status", LookIn:=xlValues, LookAt:=xlPart) Is Nothing Then
Else
'wsTarget.ListObjects(1).ListColumns(indTarCol).DataBodyRange.Select 'for watching
MiscHighlightDup wsTarget.ListObjects(1).ListColumns(indTarCol).DataBodyRange, First_Color, Subsequent_Color
End If
End If
Next indTarCol
Application.ScreenUpdating = True
Debug.Print "Finished SKMMainRIDDups: " & Now()
End Sub
'####################################################
Sub SKMMainRDeleteDups(wsTarget As Worksheet, First_Color As Variant, Subsequent_Color As Variant)
'Deletes rows containing only subsequent duplicates, as determined by SKMMainRIDDups
Dim rngTarget As Range
Dim tarCell As Range
Dim indTarRow As Long 'row index in Target
Dim indTarCol As Long 'column index in Target
Dim indTarRows As Long 'number of rows in Target
Dim indTarCols As Long 'number of columns in Target
Dim flDontDelete As Boolean
Dim indRowsDeleted As Long 'statistics
Debug.Print "Starting SKMMainRDeleteDups: " & Now()
Application.ScreenUpdating = False
Set rngTarget = wsTarget.ListObjects(1).DataBodyRange
indTarRows = rngTarget.Rows.Count
indTarCols = rngTarget.Columns.Count
indRowsDeleted = 0
For indTarRow = indTarRows To 1 Step (-1) 'To delete from Table need to start at bottom
flDontDelete = False 'we're going to delete this row unless we find a good reason not to!
'This For statement and the following two If statements limit my searching to specific columns. I don't care about duplicates in other columns.
For indTarCol = 5 To indTarCols
If Not wsTarget.ListObjects(1).HeaderRowRange.Cells(1, indTarCol).Find("tbl", LookIn:=xlValues, LookAt:=xlPart) Is Nothing Then
If Not wsTarget.ListObjects(1).HeaderRowRange.Cells(1, indTarCol).Find("Status", LookIn:=xlValues, LookAt:=xlPart) Is Nothing Then
Else
'This If statement prevents wasted time from checking empty cells
If rngTarget.Cells(indTarRow, indTarCol) <> "" Then
Debug.Print "Checking Row " & indTarRow & " of " & indTarRows
'rngTarget.Cells(indTarRow, indTarCol).Select
If rngTarget.Cells(indTarRow, indTarCol).Interior.Color = Subsequent_Color Then
flDontDelete = False 'We found a Subsequent duplicate
'As long as we get to the end of this row and find no Unique items or First duplicates, we can delete the row
Else
flDontDelete = True 'If it's not a Subsequent duplicate, it must be a Unique item or a First duplicate
GoTo MoveAlong: 'No sense searching the rest of this row, bail out here
End If
End If
End If
End If
Next indTarCol
MoveAlong: 'We either got to the end of the row, or bailed out on a row that we know we won't delete
If flDontDelete = False Then 'row contained only Subsequent duplicates. IT'S OUTA HERE!
Debug.Print "Deleting Row " & indTarRow & " of " & indTarRows
Application.ScreenUpdating = True
'rngTarget.Rows(indTarRow).Select 'used for watching
rngTarget.Rows(indTarRow).Delete
indRowsDeleted = indRowsDeleted + 1
Application.ScreenUpdating = False
End If
Next indTarRow
Application.ScreenUpdating = True
Debug.Print "Finished SKMMainRIDDups: " & Now() & " Deleted " & indRowsDeleted & " rows of " & indTarRows
End Sub
'####################################################
Sub MiscHighlightDup(rngTarget As Range, First_Color As Variant, Subsequent_Color As Variant)
'rngTarget should be a single column
'Colors the first instance of a duplicate datum, and all subsequent instances with a different color
Dim tarCell As Range
'initially tried to make it JUST the populated range.
Dim rngFoundPop As Range 'Couldn't get that to work, so it's the whole range, less the parts we've already searched, gets smaller as we go!
Dim rngFoundDup As Range 'the range of duplicates
Dim indTarRows As Long
Debug.Print "Starting MiscHighlightDup for column " & rngTarget.Column & ": " & Now()
indTarRows = rngTarget.Rows.Count
Set rngFoundPop = rngTarget
'rngFoundPop.Select 'used for watching
For Each tarCell In rngTarget
If tarCell.Interior.ColorIndex = xlNone And tarCell.Value <> "" Then 'IsEmpty(tarCell) = False Then
'tarCell.Select 'used for watching
'Debug.Print "Analyzing Row " & tarCell.Row & " Of " & indTarRows 'Clutters the immediate window
Set rngFoundDup = MiscFind_Range(tarCell.Text, rngFoundPop, xlValues, xlWhole, True)
If Not rngFoundDup Is Nothing Then
If rngFoundDup.Address <> tarCell.Address Then
Debug.Print tarCell.Text & " has duplicates. First location: Column " & tarCell.Column & ", Row " & tarCell.Row
'rngFoundDup.Select 'used for watching
rngFoundDup.Interior.Color = Subsequent_Color
tarCell.Interior.Color = First_Color
End If
End If
End If
Set rngFoundPop = rngTarget.Range(Cells(tarCell.Row, 1), Cells(indTarRows, 1)) 'resizes the seached range so that it gets smaller (and we get faster!) as we go
'probably could be resized even more agressively, but too complicated for me right now
'rngFoundPop.Select 'used for watching
Next tarCell
Debug.Print "Finishing MiscHighlightDup for column " & rngTarget.Column & ": " & Now()
End Sub
'####################################################
Function MiscFind_Range(Find_Item As Variant, _
Search_Range As Range, _
Optional LookIn As Variant, _
Optional LookAt As Variant, _
Optional MatchCase As Boolean) As Range
'returns a possibly non-consecutive range containing all found items
'borrowed from http://www.ozgrid.com/forum/showthread.php?t=27240
Dim c As Range
Dim FirstAddress As String
If IsMissing(LookIn) Then LookIn = xlValues 'xlFormulas
If IsMissing(LookAt) Then LookAt = xlPart 'xlWhole
If IsMissing(MatchCase) Then MatchCase = False
With Search_Range
Set c = .Find( _
What:=Find_Item, _
LookIn:=LookIn, _
LookAt:=LookAt, _
SearchOrder:=xlByRows, _
SearchDirection:=xlNext, _
MatchCase:=MatchCase, _
SearchFormat:=False)
If Not c Is Nothing Then
Set MiscFind_Range = c
FirstAddress = c.Address
Do
Set MiscFind_Range = Union(MiscFind_Range, c)
Set c = .FindNext(c)
Loop While Not c Is Nothing And c.Address <> FirstAddress
End If
End With
End Function
What I need to be able to do is search selected columns and identify duplicate data, differentiating between first instance and subsequent instances. Then I need to delete each row that contains ONLY subsequent duplicate data. If a row contains even one unique or first instance item, it is NOT to be deleted.
I have written a series of subroutines to do this, and they mostly work. It's just super slow. Can you identify some performance killers and help me speed this up?
I have uploaded a half-size version of the spreadsheet if you want to play with it. The full version is too big! 10552
On this half-size version with 3727 rows, the whole routine takes about 20 minutes and deletes 2549 rows.
Complete code is below:
Sub SKMMainRPurgeDups() 'Main wrapper for finding duplicates in each column and then deleting rows containing only Subsequent duplicates
Dim wsTarget As Worksheet
Dim strStarTime As String
Dim strStopTime As String
strStartTime = Now()
Set wsTarget = ActiveSheet
SKMMainRIDDups wsTarget, vbGreen, vbRed
SKMMainRDeleteDups wsTarget, vbGreen, vbRed
wsTarget.ListObjects(1).DataBodyRange.Interior.ColorIndex = xlNone
strStopTime = Now()
Debug.Print "Started at " & strStartTime & "; finished at " & strStopTime
Beep 'since I'm probably not paying attention....
End Sub
'####################################################
Sub SKMMainRIDDups(wsTarget As Worksheet, First_Color As Variant, Subsequent_Color As Variant)
'Identifies duplicates in each column
Dim indTarCol As Long
Dim indTarCols As Long
Debug.Print "Starting SKMMainRIDDups: " & Now()
Application.ScreenUpdating = False
Set wsTarget = ActiveSheet
indTarCols = wsTarget.ListObjects(1).ListColumns.Count
'This For statement and the following two If statements limit my searching to specific columns. I don't care about duplicates in other columns.
For indTarCol = 5 To indTarCols
If Not wsTarget.ListObjects(1).HeaderRowRange.Cells(1, indTarCol).Find("tbl", LookIn:=xlValues, LookAt:=xlPart) Is Nothing Then
If Not wsTarget.ListObjects(1).HeaderRowRange.Cells(1, indTarCol).Find("Status", LookIn:=xlValues, LookAt:=xlPart) Is Nothing Then
Else
'wsTarget.ListObjects(1).ListColumns(indTarCol).DataBodyRange.Select 'for watching
MiscHighlightDup wsTarget.ListObjects(1).ListColumns(indTarCol).DataBodyRange, First_Color, Subsequent_Color
End If
End If
Next indTarCol
Application.ScreenUpdating = True
Debug.Print "Finished SKMMainRIDDups: " & Now()
End Sub
'####################################################
Sub SKMMainRDeleteDups(wsTarget As Worksheet, First_Color As Variant, Subsequent_Color As Variant)
'Deletes rows containing only subsequent duplicates, as determined by SKMMainRIDDups
Dim rngTarget As Range
Dim tarCell As Range
Dim indTarRow As Long 'row index in Target
Dim indTarCol As Long 'column index in Target
Dim indTarRows As Long 'number of rows in Target
Dim indTarCols As Long 'number of columns in Target
Dim flDontDelete As Boolean
Dim indRowsDeleted As Long 'statistics
Debug.Print "Starting SKMMainRDeleteDups: " & Now()
Application.ScreenUpdating = False
Set rngTarget = wsTarget.ListObjects(1).DataBodyRange
indTarRows = rngTarget.Rows.Count
indTarCols = rngTarget.Columns.Count
indRowsDeleted = 0
For indTarRow = indTarRows To 1 Step (-1) 'To delete from Table need to start at bottom
flDontDelete = False 'we're going to delete this row unless we find a good reason not to!
'This For statement and the following two If statements limit my searching to specific columns. I don't care about duplicates in other columns.
For indTarCol = 5 To indTarCols
If Not wsTarget.ListObjects(1).HeaderRowRange.Cells(1, indTarCol).Find("tbl", LookIn:=xlValues, LookAt:=xlPart) Is Nothing Then
If Not wsTarget.ListObjects(1).HeaderRowRange.Cells(1, indTarCol).Find("Status", LookIn:=xlValues, LookAt:=xlPart) Is Nothing Then
Else
'This If statement prevents wasted time from checking empty cells
If rngTarget.Cells(indTarRow, indTarCol) <> "" Then
Debug.Print "Checking Row " & indTarRow & " of " & indTarRows
'rngTarget.Cells(indTarRow, indTarCol).Select
If rngTarget.Cells(indTarRow, indTarCol).Interior.Color = Subsequent_Color Then
flDontDelete = False 'We found a Subsequent duplicate
'As long as we get to the end of this row and find no Unique items or First duplicates, we can delete the row
Else
flDontDelete = True 'If it's not a Subsequent duplicate, it must be a Unique item or a First duplicate
GoTo MoveAlong: 'No sense searching the rest of this row, bail out here
End If
End If
End If
End If
Next indTarCol
MoveAlong: 'We either got to the end of the row, or bailed out on a row that we know we won't delete
If flDontDelete = False Then 'row contained only Subsequent duplicates. IT'S OUTA HERE!
Debug.Print "Deleting Row " & indTarRow & " of " & indTarRows
Application.ScreenUpdating = True
'rngTarget.Rows(indTarRow).Select 'used for watching
rngTarget.Rows(indTarRow).Delete
indRowsDeleted = indRowsDeleted + 1
Application.ScreenUpdating = False
End If
Next indTarRow
Application.ScreenUpdating = True
Debug.Print "Finished SKMMainRIDDups: " & Now() & " Deleted " & indRowsDeleted & " rows of " & indTarRows
End Sub
'####################################################
Sub MiscHighlightDup(rngTarget As Range, First_Color As Variant, Subsequent_Color As Variant)
'rngTarget should be a single column
'Colors the first instance of a duplicate datum, and all subsequent instances with a different color
Dim tarCell As Range
'initially tried to make it JUST the populated range.
Dim rngFoundPop As Range 'Couldn't get that to work, so it's the whole range, less the parts we've already searched, gets smaller as we go!
Dim rngFoundDup As Range 'the range of duplicates
Dim indTarRows As Long
Debug.Print "Starting MiscHighlightDup for column " & rngTarget.Column & ": " & Now()
indTarRows = rngTarget.Rows.Count
Set rngFoundPop = rngTarget
'rngFoundPop.Select 'used for watching
For Each tarCell In rngTarget
If tarCell.Interior.ColorIndex = xlNone And tarCell.Value <> "" Then 'IsEmpty(tarCell) = False Then
'tarCell.Select 'used for watching
'Debug.Print "Analyzing Row " & tarCell.Row & " Of " & indTarRows 'Clutters the immediate window
Set rngFoundDup = MiscFind_Range(tarCell.Text, rngFoundPop, xlValues, xlWhole, True)
If Not rngFoundDup Is Nothing Then
If rngFoundDup.Address <> tarCell.Address Then
Debug.Print tarCell.Text & " has duplicates. First location: Column " & tarCell.Column & ", Row " & tarCell.Row
'rngFoundDup.Select 'used for watching
rngFoundDup.Interior.Color = Subsequent_Color
tarCell.Interior.Color = First_Color
End If
End If
End If
Set rngFoundPop = rngTarget.Range(Cells(tarCell.Row, 1), Cells(indTarRows, 1)) 'resizes the seached range so that it gets smaller (and we get faster!) as we go
'probably could be resized even more agressively, but too complicated for me right now
'rngFoundPop.Select 'used for watching
Next tarCell
Debug.Print "Finishing MiscHighlightDup for column " & rngTarget.Column & ": " & Now()
End Sub
'####################################################
Function MiscFind_Range(Find_Item As Variant, _
Search_Range As Range, _
Optional LookIn As Variant, _
Optional LookAt As Variant, _
Optional MatchCase As Boolean) As Range
'returns a possibly non-consecutive range containing all found items
'borrowed from http://www.ozgrid.com/forum/showthread.php?t=27240
Dim c As Range
Dim FirstAddress As String
If IsMissing(LookIn) Then LookIn = xlValues 'xlFormulas
If IsMissing(LookAt) Then LookAt = xlPart 'xlWhole
If IsMissing(MatchCase) Then MatchCase = False
With Search_Range
Set c = .Find( _
What:=Find_Item, _
LookIn:=LookIn, _
LookAt:=LookAt, _
SearchOrder:=xlByRows, _
SearchDirection:=xlNext, _
MatchCase:=MatchCase, _
SearchFormat:=False)
If Not c Is Nothing Then
Set MiscFind_Range = c
FirstAddress = c.Address
Do
Set MiscFind_Range = Union(MiscFind_Range, c)
Set c = .FindNext(c)
Loop While Not c Is Nothing And c.Address <> FirstAddress
End If
End With
End Function