mlarsen
05-15-2008, 09:28 AM
Hello,
I have code here that works perfectly fine. It searches through a folder and subfolders for any html or htm file, then searches each file it finds for any hrefs and prints them to the spreadsheet. I am having issues with some though. It seems to find the hrefs that are on 1 line in the source code, but the ones where there are multiple hrefs within a <P></P> tage its not grabbing. Not sure what the problem is considering in the code I am searching for ANY instance of href="
I have included my code below...
Function GetURLAddress(ByVal TextToSearch) As String
Dim Start, Length
Start = InStr(1, TextToSearch, "href=", vbTextCompare) + 6
Length = InStr(1, TextToSearch, """ ", vbTextCompare) - Start
If Length < 0 Then
Length = InStr(1, TextToSearch, "'>", vbTextCompare) - Start
End If
If Length < 0 Then
GetURLAddress = ""
Else
GetURLAddress = LTrim(RTrim(Mid(TextToSearch, Start, Length)))
End If
End Function
Private Function GetHrefs(ByVal html, ByVal strFilex, ByVal strTitlex)
Dim re, matches, match, d, uri, name, r As Long, c As Range, Lrow As Long
Dim saveLink
Dim iRet
saveLink = False
Set re = CreateObject("vbscript.regexp")
re.Pattern = "<a\s+.*?href=[""\']?([^""\' >]*)[""\']?[^>]*>(.*?)<\/a>"
*do you think this is where the issue is?*
re.IgnoreCase = True
re.MultiLine = True
re.Global = True
Set matches = re.Execute(html)
For Each match In matches
iRet = InspectLink(GetURLAddress(match))
If (iRet > 0) Then
Cells(Globalindx, 2) = strFilex
Cells(Globalindx, 3) = strTitlex
Cells(Globalindx, 4) = GetURLTitle(match)
Cells(Globalindx, 5) = GetURLAddress(match)
Cells(Globalindx, 6) = GetType(iRet)
Globalindx = Globalindx + 1
End If
Next
Set matches = Nothing
Set re = Nothing
End Function
I have code here that works perfectly fine. It searches through a folder and subfolders for any html or htm file, then searches each file it finds for any hrefs and prints them to the spreadsheet. I am having issues with some though. It seems to find the hrefs that are on 1 line in the source code, but the ones where there are multiple hrefs within a <P></P> tage its not grabbing. Not sure what the problem is considering in the code I am searching for ANY instance of href="
I have included my code below...
Function GetURLAddress(ByVal TextToSearch) As String
Dim Start, Length
Start = InStr(1, TextToSearch, "href=", vbTextCompare) + 6
Length = InStr(1, TextToSearch, """ ", vbTextCompare) - Start
If Length < 0 Then
Length = InStr(1, TextToSearch, "'>", vbTextCompare) - Start
End If
If Length < 0 Then
GetURLAddress = ""
Else
GetURLAddress = LTrim(RTrim(Mid(TextToSearch, Start, Length)))
End If
End Function
Private Function GetHrefs(ByVal html, ByVal strFilex, ByVal strTitlex)
Dim re, matches, match, d, uri, name, r As Long, c As Range, Lrow As Long
Dim saveLink
Dim iRet
saveLink = False
Set re = CreateObject("vbscript.regexp")
re.Pattern = "<a\s+.*?href=[""\']?([^""\' >]*)[""\']?[^>]*>(.*?)<\/a>"
*do you think this is where the issue is?*
re.IgnoreCase = True
re.MultiLine = True
re.Global = True
Set matches = re.Execute(html)
For Each match In matches
iRet = InspectLink(GetURLAddress(match))
If (iRet > 0) Then
Cells(Globalindx, 2) = strFilex
Cells(Globalindx, 3) = strTitlex
Cells(Globalindx, 4) = GetURLTitle(match)
Cells(Globalindx, 5) = GetURLAddress(match)
Cells(Globalindx, 6) = GetType(iRet)
Globalindx = Globalindx + 1
End If
Next
Set matches = Nothing
Set re = Nothing
End Function