Consulting

Results 1 to 12 of 12

Thread: Code needed - finding duplicate text (not sentences) in Word

  1. #1
    VBAX Regular
    Joined
    Jun 2022
    Posts
    7
    Location

    Code needed - finding duplicate text (not sentences) in Word

    Hi all,

    I hope someone can help me. I have been looking for code that will help me identify where I might have duplicated parts of a sentence, rather than whole paragraphs or sentences. The reason is that in early stages of my PhD I wasn't careful about copying text from different documents, now I have a 60,000 words document but the code I have found online will only highlight duplication when it's a whole sentence (ie marked by punctuation marks).

    Is there any way to do this?

    Many thanks in advance!

    Just to clarify, I have used code that was shared on this forum, unfortunately it's very patchy and doesn't find much apart from quotations, ie (p. 354) for instance.
    Last edited by macropod; 06-24-2022 at 05:23 PM.

  2. #2
    Moderator VBAX Guru Aussiebear's Avatar
    Joined
    Dec 2005
    Location
    Queensland
    Posts
    4,997
    Location
    Quote Originally Posted by ameiro View Post
    I have used code that was shared on this forum, unfortunately it's very patchy and doesn't find much apart from quotations,
    Care to share this "very patchy " code or are we forced to go find the code? If you wish to have help provided, it would be in your best interests to be more specific.
    Remember To Do the Following....
    Use [Code].... [/Code] tags when posting code to the thread.
    Mark your thread as Solved if satisfied by using the Thread Tools options.
    If posting the same issue to another forum please show the link

  3. #3
    VBAX Regular
    Joined
    Jun 2022
    Posts
    7
    Location
    Sorry, I didn't mean to imply the code was patchy, just the results I got. Also, it's hard to be specific when you don't know what details are required.

    Option Explicit
    Sub Sample()
    Dim MyArray() As String
    Dim n As Long, i As Long
    Dim Col As New Collection
    Dim itm
    n = 0
    '> Get all the sentences from the word document in an array
    For i = 1 To ActiveDocument.Sentences.Count
      n = n + 1
      ReDim Preserve MyArray(n)
      MyArray(n) = Trim(ActiveDocument.Sentences(i).Text)
    Next
    '> Sort the array
    SortArray MyArray, 0, UBound(MyArray)
    '> Extract Duplicates
    For i = 1 To UBound(MyArray)
      If i = UBound(MyArray) Then Exit For
      If InStr(1, MyArray(i + 1), MyArray(i), vbTextCompare) Then
        On Error Resume Next
        Col.Add MyArray(i), """" & MyArray(i) & """"
        On Error GoTo 0
      End If
    Next i
    '> Highlight duplicates
    For Each itm In Col
      Selection.Find.ClearFormatting
      Selection.HomeKey wdStory, wdMove
      Selection.Find.Execute itm
      Do Until Selection.Find.Found = False
        Selection.Range.HighlightColorIndex = wdPink
        Selection.Find.Execute
      Loop
    Next
    End Sub
    
    '> Sort the array
    Public Sub SortArray(vArray As Variant, i As Long, j As Long)
    Dim tmp As Variant, tmpSwap As Variant
    Dim ii As Long, jj As Long
    ii = i: jj = j: tmp = vArray((i + j) \ 2)
    While (ii <= jj)
      While (vArray(ii) < tmp And ii < j)
        ii = ii + 1
      Wend
      While (tmp < vArray(jj) And jj > i)
        jj = jj - 1
      Wend
      If (ii <= jj) Then
        tmpSwap = vArray(ii)
        vArray(ii) = vArray(jj): vArray(jj) = tmpSwap
        ii = ii + 1: jj = jj - 1
      End If
    Wend
    If (i < jj) Then SortArray vArray, i, jj
    If (ii < j) Then SortArray vArray, ii, j 
    End Sub

    When I ran it, it did work, but only highlighted random things instead of the repeated sentences that I know are in there.

    I have also tried this:

    Sub FindDuplicateSentences()
    Application.ScreenUpdating = False
    Dim i As Long, RngSrc As Range, RngFnd As Range
    Const Clr As Long = wdBrightGreen
    Dim eTime As Single
    eTime = Timer
    Options.DefaultHighlightColorIndex = Clr
    With ActiveDocument
      With .Range.Find
        .ClearFormatting
        .Replacement.ClearFormatting
        .Forward = True
        .Format = False
        .MatchCase = False
        .MatchWholeWord = False
        .MatchWildcards = False
        .MatchSoundsLike = False
        .MatchAllWordForms = False
        .Execute
      End With
      For i = 1 To .Sentences.Count
        If i Mod 100 = 0 Then DoEvents
        On Error Resume Next
        Set RngSrc = .Sentences(i)
        If RngSrc.HighlightColorIndex <> Clr Then
          Set RngFnd = .Range(.Sentences(i).End, .Range.End)
          If Len(RngSrc.Text) < 256 Then
            With RngFnd.Find
              .Text = RngSrc.Text
              .Replacement.Text = "^&"
              .Replacement.Highlight = True
              .Wrap = wdFindStop
              .Execute Replace:=wdReplaceAll
            End With
          Else
            With RngFnd
              With .Find
                .Text = Left(RngSrc.Text, 255)
                .Wrap = wdFindStop
                .Execute
              End With
              Do While .Find.Found
                If RngSrc.Text = .Duplicate.Text Then
                  RngSrc.HighlightColorIndex = Clr
                  .Duplicate.HighlightColorIndex = Clr
                End If
                .Collapse wdCollapseEnd
                .Find.Execute
              Loop
            End With
          End If
        End If
      Next
    End With
    ' Report time taken. Elapsed time calculation allows for execution to extend past midnight.
    MsgBox "Finished. Elapsed time: " & (Timer - eTime + 86400) Mod 86400 & " seconds."
    Application.ScreenUpdating = True
    End Sub
    Last edited by macropod; 06-24-2022 at 05:24 PM.

  4. #4
    VBAX Regular
    Joined
    Jun 2022
    Posts
    7
    Location
    Sorry, I realise that I completely forgot to say I have Word for Mac version 16.62.

    Thanks again!

  5. #5
    Moderator VBAX Guru Aussiebear's Avatar
    Joined
    Dec 2005
    Location
    Queensland
    Posts
    4,997
    Location
    Maybe try this version
    Option Explicit
    Sub Sample()
    Dim MyArray() As String
    Dim n As Long, i As Long
    Dim Col As New Collection
    Dim itm
    n = 0
    'Get all the sentences from the word document in an array
    For i = 1 To ActiveDocument.Sentences.Count
      n = n + 1
      ReDim Preserve MyArray(n)
      MyArray(n) = Trim(ActiveDocument.Sentences(i).Text)
    Next
    'Sort the array
    SortArray MyArray, 0, UBound(MyArray)
    'Extract Duplicates
    For i = 1 To UBound(MyArray)
      If i = UBound(MyArray) Then Exit For
        If InStr(1, MyArray(i + 1), MyArray(i), vbTextCompare) Then
          On Error Resume Next
          Col.Add MyArray(i), &quot;&quot;&quot;&quot; &amp; MyArray(i) &amp; &quot;&quot;&quot;&quot;
          On Error GoTo 0
        End If
      End If
    Next i
    'Highlight duplicates
    For Each itm In Col
      Selection.Find.ClearFormatting
      Selection.HomeKey wdStory, wdMove
      Selection.Find.Execute itm
      Do Until Selection.Find.Found = False
        Selection.Range.HighlightColorIndex = wdPink
        Selection.Find.Execute
      Loop
    Next
    End Sub
    
    '&gt; Sort the array
    Public Sub SortArray(vArray As Variant, i As Long, j As Long)
    Dim tmp As Variant, tmpSwap As Variant
    Dim ii As Long, jj As Long
    ii = i: jj = j: tmp = vArray((i + j) \ 2)
    While (ii &lt;= jj)
      While (vArray(ii) &lt; tmp And ii &lt; j)
        ii = ii + 1
      Wend
      While (tmp &lt; vArray(jj) And jj &gt; i)
        jj = jj - 1
      Wend
      If (ii &lt;= jj) Then
        tmpSwap = vArray(ii)
        vArray(ii) = vArray(jj): vArray(jj) = tmpSwap
        ii = ii + 1: jj = jj - 1
      End If
    Wend
    If (i &lt; jj) Then 
     SortArray vArray, i, jj
    End If
    If (ii &lt; j) Then 
     SortArray vArray, ii, j 
    End If
    End Sub
    Remember To Do the Following....
    Use [Code].... [/Code] tags when posting code to the thread.
    Mark your thread as Solved if satisfied by using the Thread Tools options.
    If posting the same issue to another forum please show the link

  6. #6
    VBAX Regular
    Joined
    Jun 2022
    Posts
    7
    Location
    Thanks, the one you provided gave ne a syntax error?

  7. #7
    Moderator VBAX Guru Aussiebear's Avatar
    Joined
    Dec 2005
    Location
    Queensland
    Posts
    4,997
    Location
    what was the syntax error?
    Remember To Do the Following....
    Use [Code].... [/Code] tags when posting code to the thread.
    Mark your thread as Solved if satisfied by using the Thread Tools options.
    If posting the same issue to another forum please show the link

  8. #8
    VBAX Regular
    Joined
    Jun 2022
    Posts
    7
    Location
    Quote Originally Posted by Aussiebear View Post
    what was the syntax error?
    That's all it said, it didn't specify anything else.

  9. #9
    Moderator VBAX Guru Aussiebear's Avatar
    Joined
    Dec 2005
    Location
    Queensland
    Posts
    4,997
    Location
    Did it highlight any particular line?
    Remember To Do the Following....
    Use [Code].... [/Code] tags when posting code to the thread.
    Mark your thread as Solved if satisfied by using the Thread Tools options.
    If posting the same issue to another forum please show the link

  10. #10
    VBAX Regular
    Joined
    Jun 2022
    Posts
    7
    Location
    Quote Originally Posted by Aussiebear View Post
    Did it highlight any particular line?
    It didn't, when it ran again however it highlighted things in the text that are not duplicated.

  11. #11
    VBAX Regular
    Joined
    Jun 2022
    Posts
    7
    Location
    It's also the same things that are highlighted, over and over again. However, they're either a page reference such as (p.) Or random text that is not a duplicate. I clicked reset after every try but nothing made a difference.

  12. #12
    Moderator VBAX Master georgiboy's Avatar
    Joined
    Mar 2008
    Location
    Kent, England
    Posts
    1,158
    Location
    Lots of strange symbols in the code above, try with the below - no strange symbols in the code.

    Sub Sample()    Dim MyArray() As String
        Dim n As Long, i As Long
        Dim Col As New Collection
        Dim itm
    
    
        n = 0
        ' Get all the sentences from the word document in an array
        For i = 1 To ActiveDocument.Sentences.Count
            n = n + 1
            ReDim Preserve MyArray(n)
            MyArray(n) = Trim(ActiveDocument.Sentences(i).Text)
        Next
    
    
        ' Sort the array
        SortArray MyArray, 0, UBound(MyArray)
    
    
        ' Extract Duplicates
        For i = 1 To UBound(MyArray)
            If i = UBound(MyArray) Then Exit For
            If InStr(1, MyArray(i + 1), MyArray(i), vbTextCompare) Then
                On Error Resume Next
                Col.Add MyArray(i), """" & MyArray(i) & """"
                On Error GoTo 0
            End If
        Next i
    
    
        ' Highlight duplicates
        For Each itm In Col
            Selection.Find.ClearFormatting
            Selection.HomeKey wdStory, wdMove
            Selection.Find.Execute itm
            Do Until Selection.Find.Found = False
                Selection.Range.HighlightColorIndex = wdPink
                Selection.Find.Execute
            Loop
        Next
    End Sub
    
    
    ' Sort the array
    Public Sub SortArray(vArray As Variant, i As Long, j As Long)
      Dim tmp As Variant, tmpSwap As Variant
      Dim ii As Long, jj As Long
    
    
      ii = i: jj = j: tmp = vArray((i + j) \ 2)
    
    
      While (ii <= jj)
         While (vArray(ii) < tmp And ii < j)
            ii = ii + 1
         Wend
         While (tmp < vArray(jj) And jj > i)
            jj = jj - 1
         Wend
         If (ii <= jj) Then
            tmpSwap = vArray(ii)
            vArray(ii) = vArray(jj): vArray(jj) = tmpSwap
            ii = ii + 1: jj = jj - 1
         End If
      Wend
      If (i < jj) Then SortArray vArray, i, jj
      If (ii < j) Then SortArray vArray, ii, j
    End Sub
    Click here for a guide on how to add code tags
    Click here for a guide on how to mark a thread as solved

    Excel 365, Version 2401, Build 17231.20084

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •