PDA

View Full Version : List all capitalized word from documents



reppa
01-05-2016, 01:49 AM
I found this vba-code on net and I've tried to adjust so it only search for words with capitalized first letter. My problem is that I don't even can get it to list all words so it match case from the orginal documents.

I have tried to
SingleWord = Trim(LCase(aWord)) with
SingleWord = Trim(aWord)

Is there anyone who can help me with this issue?

Here is the code;



Option Explicit



Function GetFolder(Optional Title As String = "Select a Folder", Optional RootFolder As Variant) As String
On Error Resume Next
GetFolder = CreateObject("Shell.Application").BrowseForFolder(0, Title, 0, RootFolder).Items.Item.Path
End Function


Sub LoopAll()
Dim strPath As String
Dim strFile As String
Dim oCurDoc As Document
Dim oDoc As Document
Dim oTempDoc As Document
Dim Ans As String


'Word Breaker variables
Dim TotalWords As Long
Dim SingleWord As String 'Raw word pulled from doc
Const MaxWords = 9000 'Maximum unique words allowed
Dim Words(MaxWords) As String 'Array to hold unique words
Dim Freq(MaxWords) As Integer 'Frequency counter for Unique Words
Dim WordNum As Integer 'Number of unique words
Dim ByFreq As Boolean 'Flag for sorting order
Dim ttlwds As Long 'Total words in the document
Dim Found As Boolean 'Temporary flag
Dim j, k, l, Temp As Integer 'Temporary variables
Dim tword As String '
Dim thisDoc As String
Dim Excludes As String 'Words to be excluded
Dim aWord As Range

On Error GoTo ErrHandler


Set oCurDoc = Documents.Add
'strPath = "C:\Word\" ' note the trailing \
'strPath = "D:\Data\TestFolder\" ' note the trailing \
strPath = InputBox$("Enter Path (ex: D:\data\testfolder\ - note trailing \)", "Data Path", "D:\data\testfolder\")
'strPath = GetFolder(Title:="Find a Folder", RootFolder:=&H11) & ""


'strFile = Dir(strPath & "*.doc") 'word docs only
'strFile = Dir(strPath & "*.*") 'scan all files
strFile = Dir(strPath & InputBox$("Enter File Extensions to scan", "File types", "*.docx"))


' Set up excluded words ... edit the list below.
Excludes = "[a][also][an][and][are][as][at][be][by]" & vbCrLf
Excludes = Excludes & "[for][if][in][is][it][its][of][on][or][so]" & vbCrLf
Excludes = Excludes & "[then][there][them][the][this][that][to][you]" & vbCrLf
Excludes = Excludes & InputBox$("The following words are excluded: " & Excludes & _
".Enter any other words that you wish to exclude, surrounding each word with [ ].", _
"Excluded Words", "")


' Find out how to sort
ByFreq = True
Ans = InputBox$("Sort by WORD or by FREQ?", "Sort order", "WORD")
If Ans = "" Then
Exit Sub
End If


If UCase(Ans) = "WORD" Then
ByFreq = False
End If


System.Cursor = wdCursorWait
Application.ScreenUpdating = False


Do While Not strFile = ""
' Open document
Set oDoc = Documents.Open(FileName:=strPath & strFile, AddToRecentFiles:=False)
thisDoc = oDoc.Name


'~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~
'This code compiles a sorted table showing the frequency with which
'any given word appears in a document. Words not required for inclusion in
'the count can be excluded.
'Adapted from code by macropod on WOPR.com by Chalmers Davis
'20030820


Selection.HomeKey Unit:=wdStory
WordNum = 0
ttlwds = ActiveDocument.Words.Count
TotalWords = ActiveDocument.Words.Count
Selection.Range.Case = wdTitleWord

' Control the repeat
For Each aWord In ActiveDocument.Words

SingleWord = Trim(LCase(aWord))
' SingleWord = Trim(aWord)
If (Left(SingleWord, 1) = "'") And (Len(SingleWord) > 1) Then
SingleWord = Mid(SingleWord, 2) 'trash that leading apostrophe
If Right(SingleWord, 1) = "'" Then 'drop any trailing one, too.
SingleWord = Left(SingleWord, Len(SingleWord) - 1)
End If
Else
If SingleWord < "a" Or SingleWord > "z" Then SingleWord = "" 'Out of range?
End If

If InStr(Excludes, "[" & SingleWord & "]") Then SingleWord = "" 'On exclude list?

If Len(SingleWord) > 0 Then
Found = False

For j = 1 To WordNum
If Words(j) = SingleWord Then
Freq(j) = Freq(j) + 1
Found = True
Exit For
End If
Next j


If Not Found Then
WordNum = WordNum + 1
Words(WordNum) = SingleWord
Freq(WordNum) = 1
End If

If WordNum > MaxWords - 1 Then
MsgBox "The maximum array size has been exceeded. Increase maxwords.", vbExclamation
Exit For
End If


End If
ttlwds = ttlwds - 1
StatusBar = "Remaining: " & ttlwds & " Unique: " & WordNum
Next aWord


' Now sort it into word order
For j = 1 To WordNum - 1
k = j

For l = j + 1 To WordNum
If (Not ByFreq And Words(l) < Words(k)) Or (ByFreq And Freq(l) > Freq(k)) Then k = l
Next l


If k <> j Then
tword = Words(j)
Words(j) = Words(k)
Words(k) = tword
Temp = Freq(j)
Freq(j) = Freq(k)
Freq(k) = Temp
End If


StatusBar = "Sorting: " & WordNum - j


Next j


' Close document without saving it
oDoc.Close SaveChanges:=wdDoNotSaveChanges


' Now write out the results
Set oTempDoc = Documents.Add ' write results to this document
Selection.ParagraphFormat.TabStops.ClearAll


With Selection
For j = 1 To WordNum
.TypeText Text:=Words(j) & vbTab & Trim(Str(Freq(j))) & vbCrLf
Next j
End With


oTempDoc.Range.Select
Selection.ConvertToTable
Selection.Collapse wdCollapseStart


With oTempDoc.Tables(1)
.Rows.Add BeforeRow:=Selection.Rows(1)


'Insert filename in first row
.Cell(1, 1).Range.InsertBefore "Title"
.Cell(1, 2).Range.InsertBefore thisDoc
.Rows.Add


.Range.ParagraphFormat.Alignment = wdAlignParagraphLeft


.Rows.Add
.Cell(ActiveDocument.Tables(1).Rows.Count, 1).Range.InsertBefore _
"Total words in Document"
.Cell(ActiveDocument.Tables(1).Rows.Count, 2).Range.InsertBefore TotalWords
.Rows.Add
.Cell(ActiveDocument.Tables(1).Rows.Count, 1).Range.InsertBefore _
"No. of different words"
.Cell(ActiveDocument.Tables(1).Rows.Count, 2).Range.InsertBefore _
Trim(Str(WordNum))
'Adjust column withs with AutoFit
.Columns.AutoFit
.Range.Copy
End With

'Run Spell Checker
'For i = ActiveDocument.SpellingErrors.Count To 1 Step -1
' SendKeys "%c"
' ActiveDocument.SpellingErrors(i).CheckSpelling
'Next i
oCurDoc.Activate
Selection.EndKey Unit:=wdStory
Selection.Paste
Selection.InsertBreak Type:=wdPageBreak
oTempDoc.Close SaveChanges:=wdDoNotSaveChanges
'~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~


strFile = Dir
Loop


ExitHandler:
' Cleaning up
Selection.HomeKey wdStory
System.Cursor = wdCursorNormal
Application.ScreenUpdating = True
Set aWord = Nothing
Set oCurDoc = Nothing
Set oTempDoc = Nothing
Set oDoc = Nothing
Exit Sub


ErrHandler:
MsgBox Err.Description, vbExclamation
Resume ExitHandler
End Sub

gmayor
01-05-2016, 03:12 AM
Rather than try and debug someone else's code the following will extract all the unique words beginning with a capital letter to a new document:

Option Explicit

Sub FindWords()
Dim oRng As Range
Dim Coll As New Collection
Dim Arr() As Variant
Dim oDoc As Document
Dim i As Long
Set oRng = ActiveDocument.Range
With oRng.Find
Do While .Execute(FindText:="<[A-Z][a-z]{1,}>", MatchWildcards:=True)
Coll.Add oRng.Text
oRng.Collapse 0
Loop
End With
Arr = toArray(Coll)
QuickSort Arr
Set oDoc = Documents.Add
For i = LBound(Arr) To UBound(Arr)
Select Case i
Case LBound(Arr)
oDoc.Range.InsertAfter Arr(i)
If i < UBound(Arr) Then oDoc.Range.InsertAfter vbCr
Case Else
If Not Arr(i) = Arr(i - 1) Then
oDoc.Range.InsertAfter Arr(i)
If i < UBound(Arr) Then oDoc.Range.InsertAfter vbCr
End If
End Select
Next i
lbl_Exit:
Exit Sub
End Sub
Private Function toArray(ByVal Coll As Collection) As Variant
Dim Arr() As Variant
Dim i As Long
ReDim Arr(1 To Coll.Count) As Variant
For i = 1 To Coll.Count
Arr(i) = Coll(i)
Next
toArray = Arr
lbl_Exit:
Exit Function
End Function

Private Sub QuickSort(vArray As Variant, _
Optional lng_Low As Long, _
Optional lng_High As Long)
Dim vPivot As Variant
Dim vTmp_Swap As Variant
Dim tmp_Low As Long
Dim tmp_High As Long

If lng_High = 0 Then
lng_Low = LBound(vArray)
lng_High = UBound(vArray)
End If

tmp_Low = lng_Low
tmp_High = lng_High
vPivot = vArray((lng_Low + lng_High) \ 2)
While (tmp_Low <= tmp_High)
While (vArray(tmp_Low) < vPivot And tmp_Low < lng_High)
tmp_Low = tmp_Low + 1
Wend
While (vPivot < vArray(tmp_High) And tmp_High > lng_Low)
tmp_High = tmp_High - 1
Wend
If (tmp_Low <= tmp_High) Then
vTmp_Swap = vArray(tmp_Low)
vArray(tmp_Low) = vArray(tmp_High)
vArray(tmp_High) = vTmp_Swap
tmp_Low = tmp_Low + 1
tmp_High = tmp_High - 1
End If
Wend
If (lng_Low < tmp_High) Then QuickSort vArray, lng_Low, tmp_High
If (tmp_Low < lng_High) Then QuickSort vArray, tmp_Low, lng_High
lbl_Exit:
Exit Sub
End Sub

reppa
01-05-2016, 04:43 AM
Hi gmayor!


You saved my day! VBA is not my strength, so I tried to adjust/debug with my minor competence.


If you wrote this script today - am I really impressed.


Thank you very much!

gmayor
01-05-2016, 04:54 AM
The two processes are standard processes for converting a collection to an array and sorting an array copied from my collection of functions. The rest is just a simple range search adding what you find that matches to a collection and then writing the unique values from the sorted array to a new document. Glad it did the job for you.

reppa
01-05-2016, 03:37 PM
Is there a way to expand
<[A-Z][a-z]{1,}> to look for one, two and up to three words in same sentence?

I've tried
[!. ]<[A-Z][a-z]{1;}>?<[A-Z][a-z]{1;}>?<[A-Z][a-z]{1;}> - but this will return words before "." and paragraphs.

e.g.


Lorem Ipsum dolor sit amet, consectetur adipiscing elit. Mauris lobortis, Quam At Dignissim mollis, mi urna convallis tellus, ac malesuada lacus metus ut lacus. Nullam Quis ipsum vel diam maximus ultricies et gravida dolor.

..should give;
Lorem Ipsum
Mauris
Quam At Dignissim
Nullam Quis

or should I use another vba-syntax?

-r

gmayor
01-05-2016, 11:10 PM
The following should do the job


Sub FindWords()
Dim oRng As Range, oFound As Range
Dim Coll As New Collection
Dim Arr() As Variant
Dim oDoc As Document
Dim i As Long
Set oRng = ActiveDocument.Range
With oRng.Find
Do While .Execute(FindText:="<[A-Z][a-z]{1,}>", MatchWildcards:=True)
If oRng.Words(1).Next.Characters(1).Case = wdUpperCase Then
oRng.End = oRng.Words(1).Next.End
oRng.MoveEndWhile "abcdefghijklmnopqrstuvwxyz"
If oRng.Words(2).Next.Characters(1).Case = wdUpperCase Then
oRng.End = oRng.Words(2).Next.End
oRng.MoveEndWhile "abcdefghijklmnopqrstuvwxyz"
If oRng.Words(3).Next.Characters(1).Case = wdUpperCase Then
oRng.End = oRng.Words(3).Next.End
oRng.MoveEndWhile "abcdefghijklmnopqrstuvwxyz"
If oRng.Words(4).Next.Characters(1).Case = wdUpperCase Then
oRng.End = oRng.Words(4).Next.End
oRng.MoveEndWhile "abcdefghijklmnopqrstuvwxyz"
End If
End If
End If
End If
'MsgBox oRng.Text
Coll.Add oRng.Text
oRng.Collapse 0
Loop
End With
Arr = toArray(Coll)
QuickSort Arr
Set oDoc = Documents.Add
For i = LBound(Arr) To UBound(Arr)
Select Case i
Case LBound(Arr)
oDoc.Range.InsertAfter Arr(i)
If i < UBound(Arr) Then oDoc.Range.InsertAfter vbCr
Case Else
If Not Arr(i) = Arr(i - 1) Then
oDoc.Range.InsertAfter Arr(i)
If i < UBound(Arr) Then oDoc.Range.InsertAfter vbCr
End If
End Select
Next i
lbl_Exit:
Exit Sub
End Sub
The rest of the code is the same. If there are more than 5 consecutive words with initial caps in the string you will have to modify the code to accommodate them.