When a Website has data arranged in some type of table format:
...But you would like it to look something like this (In Excel):
Check the source code for Labels to use (and adapt code as needed)
1) Create a string that contains the Headers
2) Create a list of links that contain the individual pages
3) Create a loop that calls the function and loads data for each row based on the Column A value
4) Each website is different - code looks at things like:
a) Class Labels / Values (["my_title">"], ["my_data">""])
b) Close Tags (</div>, </a>)
2) Create a list of links that contain the individual pages
3) Create a loop that calls the function and loads data for each row based on the Column A value
4) Each website is different - code looks at things like:
a) Class Labels / Values (["my_title">"], ["my_data">""])
b) Close Tags (</div>, </a>)
Call the function using this method:
Here is a sample of the code:
(Also Requires: RestringHTMLLinks)
Function WebScrape_ProductPage_FB(MyTab As Variant, MyRow As Variant, MyDumpCol As Variant, MyLinkCol As Variant)
'Enum READYSTATE
' READYSTATE_UNINITIALIZED = 0
' READYSTATE_LOADING = 1
' READYSTATE_LOADED = 2
' READYSTATE_INTERACTIVE = 3
' READYSTATE_COMPLETE = 4
'End Enum
Dim sWebLink As String
Dim iLoop As Long, sQuote As String, sTempA As String, sTempB As String, sTempC As String, sTempD As String, sTempE As String, sTempF As String, sTempG As String
Dim iTempA As Long, iTempB As Long, iTempC As Long, iTempD As Long, iTempE As Long, iTempF As Long, iTempG As Long
Dim iCloseTagA As Long, iStartVal As Long
Dim sCloseAddr As String, sHref As String, sAmp As String, sDelimiter As String, sMarkStart As String
Dim iUboundR As Long, iUboundC As Long, aDump() As Variant
Dim Ws1 As Long, iNextRow As Long
Dim sHeader As String, aSplitMe As Variant
Dim sClassLabel As String, sClassValue As String, sCloseTag As String
Dim aHeaders() As Variant
'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
Erase aDump: Erase aHeaders
Ws1 = Worksheets(MyTab).Index
iNextRow = MyRow
sQuote = """"
sWebLink = Worksheets(Ws1).Cells(iNextRow, MyLinkCol) '"https://www.fa
'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
'create the markers for the strings:
sClassLabel = "tech_title" & sQuote & ">"
sClassValue = "tech_data" & sQuote & ">"
sCloseTag = "</div>"
sCloseAddr = "</a>"
sHref = "href"
sAmp = " &"
sDelimiter = "|"
sMarkStart = ">"
'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
sHeader = "Product Number^Brand^Application^Uses^Width^Colors^Vertical Repeat^Horizontal Repeat^Railroaded^Fire Codes^Double Rubs^Content^Finish^Backing^Categories^Design Types^Scale^Cleaning Codes^Collection^Date Booked^Book^Country^Additional Product Notes^"
'Split String (Populate Public Array: Row1-Headers)
aSplitMe = Split(sHeader, "^")
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
iUboundC = Len(sHeader) - Len(Replace(sHeader, "^", "", , , vbTextCompare))
aHeaders = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address, Worksheets(Ws1).Cells(iNextRow, MyDumpCol + iUboundC).Address)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
'Load All non blank values into the Array (Header Position)
For iLoop = LBound(aSplitMe) To UBound(aSplitMe)
If aSplitMe(iLoop) <> "" Then aHeaders(1, iLoop + 1) = aSplitMe(iLoop)
Next iLoop
'===================================================================================================
Worksheets(Ws1).Select
'''GoTo gArrayMemoryCleanupOnly
Set Destination = Range(Worksheets(Ws1).Cells(1, MyDumpCol).Address)
Destination.Resize(UBound(aHeaders, 1), UBound(aHeaders, 2)).Value = aHeaders
'===================================================================================================
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'''GoTo gMemoryCleanup
'to refer to the running copy of Internet Explorer
Dim oWebBrowser As InternetExplorer
'to refer to the HTML document returned
Dim sHTML As HTMLDocument
'open Internet Explorer in memory, and go to website
Set oWebBrowser = New InternetExplorer
oWebBrowser.Visible = False
oWebBrowser.navigate sWebLink '"https://www.fa
'Wait until IE is done loading page
Do While oWebBrowser.READYSTATE <> READYSTATE_COMPLETE
Application.StatusBar = "Trying to go to website ..."
DoEvents
Loop
'show text of HTML document returned
Set sHTML = oWebBrowser.document
sTempA = sHTML.DocumentElement.innerHTML
'remove the carriage returns:
sTempA = Replace(sTempA, Chr(13), "", , , vbTextCompare)
sTempA = Replace(sTempA, Chr(10), "", , , vbTextCompare)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
iUboundR = Len(sTempA) - Len(Replace(sTempA, "src=", "", , , vbTextCompare))
aDump = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address, Worksheets(Ws1).Cells(iNextRow, MyDumpCol + iUboundC).Address)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
'Initiate the loop
iTempA = 1
Application.StatusBar = "Reviewing the HTML string"
'Review the HTML String - start over after setting the iTempA variable
gReviewHTMLstring:
'++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
'Provide escape route during testing:
DoEvents
'Create numeric checks for string positions
iTempB = InStr(iTempA, sTempA, sClassLabel, vbTextCompare)
iTempC = InStr(iTempB + 1, sTempA, sCloseTag, vbTextCompare)
iTempD = InStr(iTempC + 1, sTempA, sClassValue, vbTextCompare)
iTempE = InStr(iTempD + 1, sTempA, sCloseTag, vbTextCompare)
'Just in case values:
iCloseTagA = InStr(iTempB, sTempA, sCloseAddr) '"</a>")
iStartVal = InStrRev(sTempA, ">", iCloseTagA - 1, vbTextCompare)
gTestForContinue:
If iTempB = 0 Or iTempC = 0 Or iTempD = 0 Or iTempE = 0 Then GoTo gFinishedWithReview
If iTempC > iTempD Then MsgBox "Error: Mid function not available due to mismatched mid/end values"
If iTempD > iTempE Then MsgBox "Error: Mid function not available due to mismatched mid/end values"
'YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY
'YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY
gExtractData:
'Extract data
'Extract the Label
sTempB = Mid(sTempA, iTempB, iTempC - iTempB)
sTempB = Replace(sTempB, sClassLabel, "", , , vbTextCompare)
'Extract the Value
sTempC = Mid(sTempA, iTempD, iTempE - iTempD)
If InStr(1, sTempC, sHref, vbTextCompare) > 0 Then 'sHref = "href"
If iCloseTagA < iTempE And iStartVal > iTempB Then
sTempC = RestringHTMLLinks(Replace(sTempC, sClassValue, "", , , vbTextCompare), sCloseAddr, sDelimiter, sMarkStart)
Else
sTempC = ""
End If
End If
sTempC = Replace(sTempC, sClassValue, "", , , vbTextCompare)
If InStr(1, sTempC, sAmp, vbTextCompare) > 0 Then sTempC = Replace(sTempC, sAmp, " and", , , vbTextCompare) 'sAmp = "&"
sTempC = Trim(sTempC)
gCheckForMatchingHeader:
iTempG = 0
'check to see if the Label matches a correct Header value
For iLoop = LBound(aDump, 2) To UBound(aDump, 2)
If UCase(sTempB) = UCase(aHeaders(1, iLoop)) Then
iTempG = iLoop
GoTo gMatchMadeDoSomething
End If
Next iLoop
gMatchMadeDoSomething:
'Test if Col Position Defined
If iTempG = 0 Then GoTo gIncrementStartSearchPosition
'populate Array
Application.StatusBar = "Processing a Match: " & iNextRow & " | " & sTempB & "(" & Left(sTempC, 50) & ")"
aDump(1, iTempG) = sTempC
GoTo gIncrementStartSearchPosition:
gIncrementStartSearchPosition:
iTempA = InStr(iTempD + 1, sTempA, sClassLabel, vbTextCompare)
'A:Stop the process - no more matches
If iTempA = 0 Then GoTo gFinishedWithReview
'B: Check the remainder of the string - more matches available
GoTo gReviewHTMLstring
'++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
gFinishedWithReview:
'Dump the results:
'===================================================================================================
Worksheets(Ws1).Select
Set Destination = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address)
Destination.Resize(UBound(aDump, 1), UBound(aDump, 2)).Value = aDump
'===================================================================================================
GoTo gMemoryCleanup
gMemoryCleanup:
'(Close down oWebBrowser) / (Reset status bar)
oWebBrowser.Quit
Set oWebBrowser = Nothing
Application.StatusBar = ""
gArrayMemoryCleanupOnly:
'Release Array memory
Erase aDump: Erase aHeaders
MsgBox "Done"
End Function
'Enum READYSTATE
' READYSTATE_UNINITIALIZED = 0
' READYSTATE_LOADING = 1
' READYSTATE_LOADED = 2
' READYSTATE_INTERACTIVE = 3
' READYSTATE_COMPLETE = 4
'End Enum
Dim sWebLink As String
Dim iLoop As Long, sQuote As String, sTempA As String, sTempB As String, sTempC As String, sTempD As String, sTempE As String, sTempF As String, sTempG As String
Dim iTempA As Long, iTempB As Long, iTempC As Long, iTempD As Long, iTempE As Long, iTempF As Long, iTempG As Long
Dim iCloseTagA As Long, iStartVal As Long
Dim sCloseAddr As String, sHref As String, sAmp As String, sDelimiter As String, sMarkStart As String
Dim iUboundR As Long, iUboundC As Long, aDump() As Variant
Dim Ws1 As Long, iNextRow As Long
Dim sHeader As String, aSplitMe As Variant
Dim sClassLabel As String, sClassValue As String, sCloseTag As String
Dim aHeaders() As Variant
'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
Erase aDump: Erase aHeaders
Ws1 = Worksheets(MyTab).Index
iNextRow = MyRow
sQuote = """"
sWebLink = Worksheets(Ws1).Cells(iNextRow, MyLinkCol) '"https://www.fa
'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
'create the markers for the strings:
sClassLabel = "tech_title" & sQuote & ">"
sClassValue = "tech_data" & sQuote & ">"
sCloseTag = "</div>"
sCloseAddr = "</a>"
sHref = "href"
sAmp = " &"
sDelimiter = "|"
sMarkStart = ">"
'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
sHeader = "Product Number^Brand^Application^Uses^Width^Colors^Vertical Repeat^Horizontal Repeat^Railroaded^Fire Codes^Double Rubs^Content^Finish^Backing^Categories^Design Types^Scale^Cleaning Codes^Collection^Date Booked^Book^Country^Additional Product Notes^"
'Split String (Populate Public Array: Row1-Headers)
aSplitMe = Split(sHeader, "^")
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
iUboundC = Len(sHeader) - Len(Replace(sHeader, "^", "", , , vbTextCompare))
aHeaders = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address, Worksheets(Ws1).Cells(iNextRow, MyDumpCol + iUboundC).Address)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
'Load All non blank values into the Array (Header Position)
For iLoop = LBound(aSplitMe) To UBound(aSplitMe)
If aSplitMe(iLoop) <> "" Then aHeaders(1, iLoop + 1) = aSplitMe(iLoop)
Next iLoop
'===================================================================================================
Worksheets(Ws1).Select
'''GoTo gArrayMemoryCleanupOnly
Set Destination = Range(Worksheets(Ws1).Cells(1, MyDumpCol).Address)
Destination.Resize(UBound(aHeaders, 1), UBound(aHeaders, 2)).Value = aHeaders
'===================================================================================================
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'''GoTo gMemoryCleanup
'to refer to the running copy of Internet Explorer
Dim oWebBrowser As InternetExplorer
'to refer to the HTML document returned
Dim sHTML As HTMLDocument
'open Internet Explorer in memory, and go to website
Set oWebBrowser = New InternetExplorer
oWebBrowser.Visible = False
oWebBrowser.navigate sWebLink '"https://www.fa
'Wait until IE is done loading page
Do While oWebBrowser.READYSTATE <> READYSTATE_COMPLETE
Application.StatusBar = "Trying to go to website ..."
DoEvents
Loop
'show text of HTML document returned
Set sHTML = oWebBrowser.document
sTempA = sHTML.DocumentElement.innerHTML
'remove the carriage returns:
sTempA = Replace(sTempA, Chr(13), "", , , vbTextCompare)
sTempA = Replace(sTempA, Chr(10), "", , , vbTextCompare)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
iUboundR = Len(sTempA) - Len(Replace(sTempA, "src=", "", , , vbTextCompare))
aDump = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address, Worksheets(Ws1).Cells(iNextRow, MyDumpCol + iUboundC).Address)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
'Initiate the loop
iTempA = 1
Application.StatusBar = "Reviewing the HTML string"
'Review the HTML String - start over after setting the iTempA variable
gReviewHTMLstring:
'++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
'Provide escape route during testing:
DoEvents
'Create numeric checks for string positions
iTempB = InStr(iTempA, sTempA, sClassLabel, vbTextCompare)
iTempC = InStr(iTempB + 1, sTempA, sCloseTag, vbTextCompare)
iTempD = InStr(iTempC + 1, sTempA, sClassValue, vbTextCompare)
iTempE = InStr(iTempD + 1, sTempA, sCloseTag, vbTextCompare)
'Just in case values:
iCloseTagA = InStr(iTempB, sTempA, sCloseAddr) '"</a>")
iStartVal = InStrRev(sTempA, ">", iCloseTagA - 1, vbTextCompare)
gTestForContinue:
If iTempB = 0 Or iTempC = 0 Or iTempD = 0 Or iTempE = 0 Then GoTo gFinishedWithReview
If iTempC > iTempD Then MsgBox "Error: Mid function not available due to mismatched mid/end values"
If iTempD > iTempE Then MsgBox "Error: Mid function not available due to mismatched mid/end values"
'YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY
'YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY
gExtractData:
'Extract data
'Extract the Label
sTempB = Mid(sTempA, iTempB, iTempC - iTempB)
sTempB = Replace(sTempB, sClassLabel, "", , , vbTextCompare)
'Extract the Value
sTempC = Mid(sTempA, iTempD, iTempE - iTempD)
If InStr(1, sTempC, sHref, vbTextCompare) > 0 Then 'sHref = "href"
If iCloseTagA < iTempE And iStartVal > iTempB Then
sTempC = RestringHTMLLinks(Replace(sTempC, sClassValue, "", , , vbTextCompare), sCloseAddr, sDelimiter, sMarkStart)
Else
sTempC = ""
End If
End If
sTempC = Replace(sTempC, sClassValue, "", , , vbTextCompare)
If InStr(1, sTempC, sAmp, vbTextCompare) > 0 Then sTempC = Replace(sTempC, sAmp, " and", , , vbTextCompare) 'sAmp = "&"
sTempC = Trim(sTempC)
gCheckForMatchingHeader:
iTempG = 0
'check to see if the Label matches a correct Header value
For iLoop = LBound(aDump, 2) To UBound(aDump, 2)
If UCase(sTempB) = UCase(aHeaders(1, iLoop)) Then
iTempG = iLoop
GoTo gMatchMadeDoSomething
End If
Next iLoop
gMatchMadeDoSomething:
'Test if Col Position Defined
If iTempG = 0 Then GoTo gIncrementStartSearchPosition
'populate Array
Application.StatusBar = "Processing a Match: " & iNextRow & " | " & sTempB & "(" & Left(sTempC, 50) & ")"
aDump(1, iTempG) = sTempC
GoTo gIncrementStartSearchPosition:
gIncrementStartSearchPosition:
iTempA = InStr(iTempD + 1, sTempA, sClassLabel, vbTextCompare)
'A:Stop the process - no more matches
If iTempA = 0 Then GoTo gFinishedWithReview
'B: Check the remainder of the string - more matches available
GoTo gReviewHTMLstring
'++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
gFinishedWithReview:
'Dump the results:
'===================================================================================================
Worksheets(Ws1).Select
Set Destination = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address)
Destination.Resize(UBound(aDump, 1), UBound(aDump, 2)).Value = aDump
'===================================================================================================
GoTo gMemoryCleanup
gMemoryCleanup:
'(Close down oWebBrowser) / (Reset status bar)
oWebBrowser.Quit
Set oWebBrowser = Nothing
Application.StatusBar = ""
gArrayMemoryCleanupOnly:
'Release Array memory
Erase aDump: Erase aHeaders
MsgBox "Done"
End Function
Function RestringHTMLLinks(myInput As Variant, myReDeliminate As Variant, myNewDelimiter As Variant, myStartChar As Variant) As String
Dim sInput As String, sPreDel As String, sPostDel As String, sStartChar As String
Dim sTempInput As String, sTempPreDel As String, sTempPostDel As String
Dim aSplitMe As Variant, sConcat As String, iCount As Long, iLoop As Long
Dim iLeft As Long
sInput = myInput
sPreDel = myReDeliminate
sPostDel = myNewDelimiter
sStartChar = myStartChar
'Convert for Splitting (replace old delimiter with new one)
sInput = Replace(sInput, sPreDel, sPostDel, , , vbTextCompare)
'Create Array for looping:
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
'Split String (Populate Public Array: Row1-Headers)
aSplitMe = Split(sInput, sPostDel)
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
For iLoop = LBound(aSplitMe) To UBound(aSplitMe)
'Create a marker for the first character position
iLeft = InStr(1, aSplitMe(iLoop), sStartChar, vbTextCompare)
'Route based on if marker was found:
If iLeft > 0 Then GoTo gExtractValue
GoTo gNextLoop
gExtractValue:
'add the value to the string being returned
sConcat = sConcat & sPostDel & Right(aSplitMe(iLoop), Len(aSplitMe(iLoop)) - iLeft)
GoTo gNextLoop
gNextLoop:
Next iLoop
gFinishedLoadResults:
RestringHTMLLinks = sConcat
If Left(RestringHTMLLinks, 1) = sPostDel Then RestringHTMLLinks = Right(RestringHTMLLinks, Len(RestringHTMLLinks) - 1)
End Function
Dim sInput As String, sPreDel As String, sPostDel As String, sStartChar As String
Dim sTempInput As String, sTempPreDel As String, sTempPostDel As String
Dim aSplitMe As Variant, sConcat As String, iCount As Long, iLoop As Long
Dim iLeft As Long
sInput = myInput
sPreDel = myReDeliminate
sPostDel = myNewDelimiter
sStartChar = myStartChar
'Convert for Splitting (replace old delimiter with new one)
sInput = Replace(sInput, sPreDel, sPostDel, , , vbTextCompare)
'Create Array for looping:
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
'Split String (Populate Public Array: Row1-Headers)
aSplitMe = Split(sInput, sPostDel)
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
For iLoop = LBound(aSplitMe) To UBound(aSplitMe)
'Create a marker for the first character position
iLeft = InStr(1, aSplitMe(iLoop), sStartChar, vbTextCompare)
'Route based on if marker was found:
If iLeft > 0 Then GoTo gExtractValue
GoTo gNextLoop
gExtractValue:
'add the value to the string being returned
sConcat = sConcat & sPostDel & Right(aSplitMe(iLoop), Len(aSplitMe(iLoop)) - iLeft)
GoTo gNextLoop
gNextLoop:
Next iLoop
gFinishedLoadResults:
RestringHTMLLinks = sConcat
If Left(RestringHTMLLinks, 1) = sPostDel Then RestringHTMLLinks = Right(RestringHTMLLinks, Len(RestringHTMLLinks) - 1)
End Function