Consulting

Results 1 to 12 of 12

Thread: Insert a function to delete any words with less than 3 characters in a VBA code

  1. #1

    Question Insert a function to delete any words with less than 3 characters in a VBA code

    Hello,

    I'm a VBA newbie, so please with me while I try to explain my problem.

    I'm a marketing analyst, and found recently a VBA code online that helps me greatly in my work. This code regroup the most repeated string of words in a text and count the most repeated (I unfortunately forgot who did this amazing code, but I'm grateful for them).

    It works well, but I want to tweak it a little: I would like to delete all words with less than 3 characters before the function is run, for it to regroup only words with more than 4 characters. I want it to be included in my existing code, for it to run in one macro only: I want to share this VBA with my colleagues and, since they are not excel tech-savy, running one macro will simplify . I tried to add existing functions that do the trick to my existing code, but unfortunately I always run with errors, and I'm not good enough to change it.

    So, here is my original code, that regroups all repetitive string of words:

    Option Explicit
    
    Sub PHRASES_TEST_1()
    '1. Add reference to "Microsoft VBScript Regular Expressions 5.5" (you need to do it once only):
    '   In Visual Basic Editor menu, select Tools –> References, then select Microsoft VBScript Regular Expressions 5.5, then click OK.
    '2. Data must be in column A, start at A1
    '3. Run Word_Phrase_Frequency_v1
    
    '--- CHANGE sNumber & xPattern VALUE TO SUIT -----------------------------------
    Dim lastRow As Long
        Dim myRange As Range
    '   Find lastRow in column A
        lastRow = Cells(Rows.Count, "A").End(xlUp).Row
    
    '   Set range to look at
        Set myRange = Range("A4:A" & lastRow)
    Const sNumber As String = "1,2,3,4,5"  '"1,2,3"
    'sNumber = "1"  will generate 1 word frequency list
    'sNumber = "1,2,3"  will generate 1 word, 2 word & 3 word frequency list
    Const xPattern As String = "A-Z0-9_'"
    'define the word characters, the above pattern will include letter, number, underscore & apostrophe as word character
    'word with apostrophe such as "you're" counts as one word.
    'word with underscore such as "aa_bb" counts as one word.
    Const xCol As String = "C:ZZ" 'columns to clear
    Dim i As Long, j As Long
    Dim txa As String
    Dim z, t
    t = Timer
    Application.ScreenUpdating = False
    Range(xCol).Clear
    'if there are errors, remove them
    On Error Resume Next
    Range("A:A").SpecialCells(xlCellTypeFormulas, xlErrors).ClearContents
    Range("A:A").SpecialCells(xlConstants, xlErrors).ClearContents
    On Error GoTo 0
    j = Range("A" & Rows.Count).End(xlUp).Row
    If j < 65000 Then
        txa = Join(Application.Transpose(Range("A1", Cells(Rows.Count, "A").End(xlUp))), "")
    Else
        For i = 1 To j Step 65000
        txa = txa & Join(Application.Transpose(Range("A" & i).Resize(65000)), "") & ""
        Next
    End If
    z = Split(sNumber, ",")
    'TO PROCESS
        For i = LBound(z) To UBound(z)
            Call toProcessY(CLng(z(i)), txa, xPattern)
        Next
    Range(xCol).Columns.AutoFit
    Application.ScreenUpdating = True
    Debug.Print "It's done in:  " & Timer - t & " seconds"
    End Sub
    
    Sub toProcessY(n As Long, ByVal tx As String, xP As String)
    'phrase frequency
    Dim regEx As Object, matches As Object, x As Object, d As Object
    Dim i As Long, rc As Long
    Dim va, q
    Set regEx = CreateObject("VBScript.RegExp")
            With regEx
                .Global = True
                .MultiLine = True
                .IgnoreCase = True
            End With
    If n > 1 Then
    regEx.Pattern = "( ){2,}"
    If regEx.Test(tx) Then
               tx = regEx.Replace(tx, "") 'remove excessive space
            End If
    tx = Trim(tx)
    '        regEx.Pattern = "[^A-Z0-9_' ]+"
            regEx.Pattern = "[^" & xP & " ]+" 'exclude xp and space
            If regEx.Test(tx) Then
               tx = regEx.Replace(tx, vbLf) 'replace non words character (excluding space) with new line char (vbLf)
            End If
    tx = Replace(tx, vbLf & "", vbLf & "") 'remove space in the beginning of every line
    End If
    Set d = CreateObject("scripting.dictionary")
        d.CompareMode = vbTextCompare
    '    regEx.Pattern = Trim(WorksheetFunction.Rept("[A-Z0-9_']+ ", n)) 'match n words (the phrase) separated by a space
        regEx.Pattern = Trim(WorksheetFunction.Rept("[" & xP & "]+ ", n)) 'match n words (the phrase) separated by a space
                Set matches = regEx.Execute(tx)
    For Each x In matches
                    d(CStr(x)) = d(CStr(x)) + 1 'get phrase frequency
                Next
    For i = 1 To n - 1
    regEx.Pattern = "^[" & xP & "]+ "
            If regEx.Test(tx) Then
               tx = regEx.Replace(tx, "")   'remove first word in each line to get different combination of n words (phrase)
    '            regEx.Pattern = Trim(WorksheetFunction.Rept("[A-Z0-9_']+ ", n))
                regEx.Pattern = Trim(WorksheetFunction.Rept("[" & xP & "]+ ", n))
    Set matches = regEx.Execute(tx)
    For Each x In matches
                    d(CStr(x)) = d(CStr(x)) + 1     'get phrase frequency
                Next
    End If
    Next
    If d.Count = 0 Then MsgBox "Nothing with " & n & " word phrase found": Exit Sub
    rc = Cells(1, Columns.Count).End(xlToLeft).Column
    'put the result
    With Cells(2, rc + 2).Resize(d.Count, 2)
    Select Case d.Count
        Case Is < 65536 'Transpose function has a limit of 65536 item to process
    .Value = Application.Transpose(Array(d.Keys, d.Items))
    Case Is <= 1048500
    ReDim va(1 To d.Count, 1 To 2)
            i = 0
                For Each q In d.Keys
                    i = i + 1
                    va(i, 1) = q: va(i, 2) = d(q)
                Next
            .Value = va
    Case Else
    MsgBox "Process is canceled, the result is more than 1048500 rows"
    End Select
    .Sort Key1:=.Cells(1, 2), Order1:=xlDescending, Key2:=.Cells(1, 1), Order2:=xlAscending, Header:=xlNo
    End With
    Cells(1, rc + 2) = n & " WORD"
    Cells(1, rc + 3) = "COUNT"
    End Sub
    If you need a base to add the function to delete 3 words under, here is a function I used first and put at the beginning of this code, where it selected specific words to delete before regrouping: and it worked wonderfully!

       '   Replace pas
        myRange.Replace What:=" pas ", Replacement:=" ", LookAt:=xlPart, _
            SearchOrder:=xlByRows, MatchCase:=False, SearchFormat:=False, _
            ReplaceFormat:=False
    A full example is attached.

    Thank you for your help!
    Attached Files Attached Files
    Last edited by Paul_Hossler; 06-01-2022 at 07:11 AM. Reason: Added CODE tags instead of QUOTE tags

  2. #2
    VBAX Contributor rollis13's Avatar
    Joined
    Jun 2013
    Location
    Cordenons
    Posts
    146
    Location
    This could be a quick way to exclude words less than 3 characters.
    In macro "PHRASES_TEST_1" add these 6 lines, here:
    '...
        For i = 1 To j Step 65000
            txa = txa & Join(Application.Transpose(Range("A" & i).Resize(65000)), "") & ""
        Next
    End If
    '--- added -------------------------------------------
    Dim sp, txa3
    sp = Split(txa, " ")
    For i = LBound(sp) To UBound(sp)
        If Len(sp(i)) > 2 Then txa3 = txa3 & (sp(i)) & " "
    Next i
    txa = txa3
    '-----------------------------------------------------
    z = Split(sNumber, ",")
    'TO PROCESS
    '...
    and in macro "toProcessY" change 1 line from:
    '...
    For Each x In matches
        d(CStr(x)) = d(CStr(x)) + 1 'get phrase frequency
    Next
    '...
    to:
    '...
    For Each x In matches
        If Len(CStr(x)) > 2 Then d(CStr(x)) = d(CStr(x)) + 1 'get phrase frequency   '<- changed
    Next
    '...
    PS. these changes will not solve previous inaccuracies, probably will add some more .

  3. #3
    Knowledge Base Approver VBAX Wizard
    Joined
    Apr 2012
    Posts
    5,646
    I'd use this macro instead of Pascale Paquin's macro:

    Sub M_snb()
      sn = UsedRange.Columns(1)
            
      For j = 1 To UBound(sn)
        c00 = c00 & " " & sn(j, 1)
      Next
      sn = Split(Application.Trim(c00))
        
      For j = 0 To UBound(sn)
        If Len(sn(j)) < 3 Then sn(j) = ""
      Next
      c00 = Application.Trim(Join(sn))
      sn = Split(c00)
      ReDim sp(UBound(sn), 14)
        
      With CreateObject("scripting.dictionary")
        For j = 0 To UBound(sn)
          c01 = sn(j)
          st = Filter(sn, c01)
          If UBound(st) > 0 Then .Item(c01 & "~1") = UBound(st) + 1
          For jj = 1 To 4
            If j + jj > UBound(sn) Then Exit For
            c01 = c01 & " " & sn(j + jj)
            st = Split(c00, c01)
            If UBound(st) > 1 Then .Item(c01 & "~" & jj + 1) = UBound(st)
          Next
        Next
           
        For j = 0 To 4
          st = Filter(.keys, "~" & j + 1)
          sp(0, 3 * j) = j + 1 & " WORD"
          sp(0, 3 * j + 1) = "COUNT"
          For jj = 0 To UBound(st)
            sp(jj + 1, 3 * j) = Left(st(jj), Len(st(jj)) - 2)
            sp(jj + 1, 3 * j + 1) = .Item(st(jj))
          Next
        Next
      End With
         
      Cells(1, 20).Resize(UBound(sp), 15) = sp
      For j = 0 To 4
        Cells(2, 20 + 3 * j).CurrentRegion.Sort Cells(2, 20 + 3 * j + 1), 2
      Next
    End Sub
    Last edited by snb; 06-01-2022 at 04:50 AM.

  4. #4
    Thanks for the code Rollis13! Unfortunately, it doesn't exactly do what I want: it does delete the words with less than 3 characters when it considers the most repeated words, but not when it's the most repeated 2-3-4 consecutive words.

    For example, if a quote is like this:

    ''I would like to have bread.''

    I want it to cut it to this:

    'Would like have bread.''

    And do this for all the text, and then tell me what are the most repeated consecutive words. Hope it's more clear!

    And when you say inaccuracies, which one do you see?

  5. #5
    Thanks for the code snb, but unfortunately it doesn't seem to stop running!

  6. #6
    Knowledge Base Approver VBAX Wizard
    Joined
    Apr 2012
    Posts
    5,646
    See the attachment.

    The result will be in column T and further.
    Attached Files Attached Files

  7. #7
    Thank you snb, unfortunately my Excel is too slow or too difficult to get it to run!

    But I found my solution Found this code on another forum:

    Sub remove2letterwords_by_sintek()
    Dim N As Long, i As Long, s As String, keep_these, ary
    Dim wf As WorksheetFunction, j As Long
    keep_these = Array("is", "on", "up")
    Set wf = Application.WorksheetFunction
    N = Cells(Rows.Count, "A").End(xlUp).Row
    For i = 1 To N
    s = Cells(i, 1).Value
    ary = Split(s, " ")
    For j = LBound(ary) To UBound(ary)
    If Len(ary(j)) < 3 Then
    If IsError(Application.Match(ary(j), keep_these, False)) Then ary(j) = ""
    End If
    Next j
    Cells(i, 4).Value = wf.Trim(Join(ary, " "))
    Next i
    End Sub
    And I created a Main Sub to run all of them one after the other

    Sub Main()

    Call remove2letterwords_by_sintek
    Call PHRASES_TEST_1

    End Sub
    Thanks for your help!

  8. #8
    VBAX Contributor rollis13's Avatar
    Joined
    Jun 2013
    Location
    Cordenons
    Posts
    146
    Location
    Not sure how you used my suggestions but this is my screenshot for your example:

    MacroTesting.jpg

  9. #9
    Quote Originally Posted by rollis13 View Post
    Not sure how you used my suggestions but this is my screenshot for your example:

    MacroTesting.jpg
    It would have been exactly what I wanted, but Excel wasn't able to run it with what I had unfortunately. It's probably because I used about 4000 lines of multiple sentences in each!

  10. #10
    Knowledge Base Approver VBAX Wizard
    Joined
    Apr 2012
    Posts
    5,646
    Did you test the file I uploaded ?

    Put your 4000 lines in column A in the attachment below and please test it again.
    Attached Files Attached Files

  11. #11
    Moderator VBAX Wizard Aussiebear's Avatar
    Joined
    Dec 2005
    Location
    Queensland
    Posts
    5,064
    Location
    Quote Originally Posted by snb View Post
    Did you test the file I uploaded ?
    Come on snb, Browniiesx has already suggest that his version of Excel had trouble running your code, for what ever reason.
    Remember To Do the Following....
    Use [Code].... [/Code] tags when posting code to the thread.
    Mark your thread as Solved if satisfied by using the Thread Tools options.
    If posting the same issue to another forum please show the link

  12. #12
    Knowledge Base Approver VBAX Wizard
    Joined
    Apr 2012
    Posts
    5,646
    If the macro runs in .14 sec in Ecels 2010, there is no Excel version (even 95) that is not able to run the macro in 2 seconds for whatever reason.

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •