When a Website has data arranged in some type of table format:
...But you would like it to look something like this (In Excel):
Check the source code for Labels to use (and adapt code as needed)
1) Create a string that contains the Headers
2) Create a list of links that contain the individual pages
3) Create a loop that calls the function and loads data for each row based on the Column A value
4) Each website is different - code looks at things like:
a) Class Labels / Values (["my_title">"], ["my_data">""])
b) Close Tags (</span>)
2) Create a list of links that contain the individual pages
3) Create a loop that calls the function and loads data for each row based on the Column A value
4) Each website is different - code looks at things like:
a) Class Labels / Values (["my_title">"], ["my_data">""])
b) Close Tags (</span>)
Call the function using this method:
Here is a sample of the code:
Function WebScrape_ProductPage_TV(MyTab As Variant, MyRow As Variant, MyDumpCol As Variant, MyLinkCol As Variant)
'Enum READYSTATE
' READYSTATE_UNINITIALIZED = 0
' READYSTATE_LOADING = 1
' READYSTATE_LOADED = 2
' READYSTATE_INTERACTIVE = 3
' READYSTATE_COMPLETE = 4
'End Enum
'===================================================================================================================================
Dim iLabelStart As Long, iLabelMid As Long, iLabelEnd As Long
Dim iValueStart As Long, iValueMid As Long, iValueEnd As Long
Dim sTempLabel As String, sTempValue As String
Dim bStartMidEndOK As Boolean
Dim sClassLabel As String, sClassValue As String
Dim sSpanStart As String, sSpanEnd As String
sSpanStart = ">"
sSpanEnd = "</span>"
sClassLabel = "col6 acol12 valueWrapper attrLabel "
sClassValue = "col6 acol12 valueWrapper attrValue"
'===================================================================================================================================
Dim sWebLink As String
Dim iLoop As Long, sQuote As String, sTempA As String, sTempB As String, sTempC As String, sTempD As String, sTempE As String, sTempF As String, sTempG As String
Dim iTempA As Long, iTempB As Long, iTempC As Long, iTempD As Long, iTempE As Long, iTempF As Long, iTempG As Long
Dim iCloseTagA As Long, iStartVal As Long
Dim sHref As String, sAmp As String, sDelimiter As String, sMarkStart As String
Dim iUboundR As Long, iUboundC As Long, aDump() As Variant
Dim Ws1 As Long, iNextRow As Long
Dim sHeader As String, aSplitMe As Variant
Dim aHeaders() As Variant
'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
Erase aDump: Erase aHeaders
Ws1 = Worksheets(MyTab).Index
iNextRow = MyRow
sQuote = """"
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
sHeader = "Heat^Mildew^Color^Fabric^Width^Cat^Gvmt^Put Up^Style^Weight^Warranty^Water^Count^Package^"
'Split String (Populate Public Array: Row1-Headers)
aSplitMe = Split(sHeader, "^")
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
iUboundC = Len(sHeader) - Len(Replace(sHeader, "^", "", , , vbTextCompare))
aHeaders = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address, Worksheets(Ws1).Cells(iNextRow, MyDumpCol + iUboundC).Address)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
'Load All non blank values into the Array (Header Position)
For iLoop = LBound(aSplitMe) To UBound(aSplitMe)
If aSplitMe(iLoop) <> "" Then aHeaders(1, iLoop + 1) = aSplitMe(iLoop)
Next iLoop
'===================================================================================================
Worksheets(Ws1).Select
Set Destination = Range(Worksheets(Ws1).Cells(1, MyDumpCol).Address)
Destination.Resize(UBound(aHeaders, 1), UBound(aHeaders, 2)).Value = aHeaders
'===================================================================================================
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'to refer to the running copy of Internet Explorer
Dim oWebBrowser As InternetExplorer
'to refer to the HTML document returned
Dim sHTML As HTMLDocument
'open Internet Explorer in memory, and go to website
Set oWebBrowser = New InternetExplorer
oWebBrowser.Visible = False
oWebBrowser.navigate sWebLink '"https://www.tv
'Wait until IE is done loading page
Do While oWebBrowser.READYSTATE <> READYSTATE_COMPLETE
Application.StatusBar = "Trying to go to website ..."
DoEvents
Loop
'show text of HTML document returned
Set sHTML = oWebBrowser.document
sTempA = sHTML.DocumentElement.innerHTML
sTempA = Replace(sTempA, Chr(13), "", , , vbTextCompare)
sTempA = Replace(sTempA, Chr(10), "", , , vbTextCompare)
sTempA = Replace(sTempA, vbTab, "", , , vbTextCompare)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
aDump = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address, Worksheets(Ws1).Cells(iNextRow, MyDumpCol + iUboundC).Address)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
'===================================================================================================================================
'===================================================================================================================================
'Initiate the loop
iTempA = 1
Application.StatusBar = "Reviewing the HTML string"
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
gReviewHTMLstringReWrite:
DoEvents
bStartMidEndOK = True
sTempLabel = ""
sTempValue = ""
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'Set the 3 markers for [Label][Value]
iLabelStart = InStr(iTempA, sTempA, sClassLabel, vbTextCompare) 'InStr(1, sTempA, "col6 acol12 valueWrapper attrLabel ", vbTextCompare)
iLabelEnd = InStr(iLabelStart + 1, sTempA, sSpanEnd, vbTextCompare) 'InStr(iLabelStart + 1, sTempA, "</span>", vbTextCompare)
iLabelMid = InStrRev(sTempA, sSpanStart, iLabelEnd - 1, vbTextCompare) 'InStrRev(sTempA, "<", iLabelEnd - 1, vbTextCompare)
iValueStart = InStr(iTempA, sTempA, sClassValue, vbTextCompare) 'InStr(1, sTempA, "col6 acol12 valueWrapper attrValue", vbTextCompare)
iValueEnd = InStr(iValueStart + 1, sTempA, sSpanEnd, vbTextCompare) 'InStr(iValueStart + 1, sTempA, "</span>", vbTextCompare)
iValueMid = InStrRev(sTempA, sSpanStart, iValueEnd - 1, vbTextCompare) 'InStrRev(sTempA, "<", iValueEnd - 1, vbTextCompare)
'===================================================================================================================================
'Test Label / Value Markers:
If iLabelMid <= iLabelStart Or iLabelMid >= iLabelEnd Then bStartMidEndOK = False
If iValueMid <= iValueStart Or iValueMid >= iValueEnd Then bStartMidEndOK = False
If bStartMidEndOK = False Then
'send it to the error handler
GoTo gMark3Error
Else
'Continue processing
GoTo gMark3Valid
End If
'===================================================================================================================================
gMark3Error:
Debug.Print "The Start, Mid or End value for a string MARKING step [Label][Value] was invalid."
GoTo gFinishedWithReview
gMark3Valid:
sTempLabel = Trim(Mid(sTempA, iLabelMid + 1, iLabelEnd - iLabelMid - 1))
sTempValue = Trim(Mid(sTempA, iValueMid + 1, iValueEnd - iValueMid - 1))
GoTo gCheckForMatchingHeader
'===================================================================================================================================
gCheckForMatchingHeader:
'Reset Value so no value is over-written when no match to header is made
iTempG = 0
'check to see if the Label matches a correct Header value
For iLoop = LBound(aDump, 2) To UBound(aDump, 2)
If UCase(sTempLabel) = UCase(aHeaders(1, iLoop)) Then
iTempG = iLoop
GoTo gMatchMadeDoSomething
End If
Next iLoop
'===================================================================================================================================
gMatchMadeDoSomething:
'Test if Col Position Defined
If iTempG = 0 Then GoTo gIncrementStartSearchPosition
'populate Array
Application.StatusBar = "Processing a Match: " & iNextRow & " | " & sTempLabel & "(" & Left(sTempValue, 50) & ")"
aDump(1, iTempG) = sTempValue
GoTo gIncrementStartSearchPosition
gIncrementStartSearchPosition:
iTempA = InStr(iLabelStart + 1, sTempA, sClassLabel, vbTextCompare)
'A:Stop the process - no more matches
If iTempA = 0 Then GoTo gFinishedWithReview
'B: Check the remainder of the string - more matches available
GoTo gReviewHTMLstringReWrite
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
gFinishedWithReview:
'Dump the results:
'===================================================================================================
Worksheets(Ws1).Select
Set Destination = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address)
Destination.Resize(UBound(aDump, 1), UBound(aDump, 2)).Value = aDump
'===================================================================================================
GoTo gMemoryCleanup
gMemoryCleanup:
'close down oWebBrowser and reset status bar
oWebBrowser.Quit
Set oWebBrowser = Nothing
Application.StatusBar = ""
gArrayMemoryCleanupOnly:
'Release Array memory
Erase aDump: Erase aHeaders
MsgBox "Done"
End Function
'Enum READYSTATE
' READYSTATE_UNINITIALIZED = 0
' READYSTATE_LOADING = 1
' READYSTATE_LOADED = 2
' READYSTATE_INTERACTIVE = 3
' READYSTATE_COMPLETE = 4
'End Enum
'===================================================================================================================================
Dim iLabelStart As Long, iLabelMid As Long, iLabelEnd As Long
Dim iValueStart As Long, iValueMid As Long, iValueEnd As Long
Dim sTempLabel As String, sTempValue As String
Dim bStartMidEndOK As Boolean
Dim sClassLabel As String, sClassValue As String
Dim sSpanStart As String, sSpanEnd As String
sSpanStart = ">"
sSpanEnd = "</span>"
sClassLabel = "col6 acol12 valueWrapper attrLabel "
sClassValue = "col6 acol12 valueWrapper attrValue"
'===================================================================================================================================
Dim sWebLink As String
Dim iLoop As Long, sQuote As String, sTempA As String, sTempB As String, sTempC As String, sTempD As String, sTempE As String, sTempF As String, sTempG As String
Dim iTempA As Long, iTempB As Long, iTempC As Long, iTempD As Long, iTempE As Long, iTempF As Long, iTempG As Long
Dim iCloseTagA As Long, iStartVal As Long
Dim sHref As String, sAmp As String, sDelimiter As String, sMarkStart As String
Dim iUboundR As Long, iUboundC As Long, aDump() As Variant
Dim Ws1 As Long, iNextRow As Long
Dim sHeader As String, aSplitMe As Variant
Dim aHeaders() As Variant
'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
Erase aDump: Erase aHeaders
Ws1 = Worksheets(MyTab).Index
iNextRow = MyRow
sQuote = """"
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
sHeader = "Heat^Mildew^Color^Fabric^Width^Cat^Gvmt^Put Up^Style^Weight^Warranty^Water^Count^Package^"
'Split String (Populate Public Array: Row1-Headers)
aSplitMe = Split(sHeader, "^")
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
iUboundC = Len(sHeader) - Len(Replace(sHeader, "^", "", , , vbTextCompare))
aHeaders = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address, Worksheets(Ws1).Cells(iNextRow, MyDumpCol + iUboundC).Address)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
'Load All non blank values into the Array (Header Position)
For iLoop = LBound(aSplitMe) To UBound(aSplitMe)
If aSplitMe(iLoop) <> "" Then aHeaders(1, iLoop + 1) = aSplitMe(iLoop)
Next iLoop
'===================================================================================================
Worksheets(Ws1).Select
Set Destination = Range(Worksheets(Ws1).Cells(1, MyDumpCol).Address)
Destination.Resize(UBound(aHeaders, 1), UBound(aHeaders, 2)).Value = aHeaders
'===================================================================================================
'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'to refer to the running copy of Internet Explorer
Dim oWebBrowser As InternetExplorer
'to refer to the HTML document returned
Dim sHTML As HTMLDocument
'open Internet Explorer in memory, and go to website
Set oWebBrowser = New InternetExplorer
oWebBrowser.Visible = False
oWebBrowser.navigate sWebLink '"https://www.tv
'Wait until IE is done loading page
Do While oWebBrowser.READYSTATE <> READYSTATE_COMPLETE
Application.StatusBar = "Trying to go to website ..."
DoEvents
Loop
'show text of HTML document returned
Set sHTML = oWebBrowser.document
sTempA = sHTML.DocumentElement.innerHTML
sTempA = Replace(sTempA, Chr(13), "", , , vbTextCompare)
sTempA = Replace(sTempA, Chr(10), "", , , vbTextCompare)
sTempA = Replace(sTempA, vbTab, "", , , vbTextCompare)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
aDump = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address, Worksheets(Ws1).Cells(iNextRow, MyDumpCol + iUboundC).Address)
'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
'===================================================================================================================================
'===================================================================================================================================
'Initiate the loop
iTempA = 1
Application.StatusBar = "Reviewing the HTML string"
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
gReviewHTMLstringReWrite:
DoEvents
bStartMidEndOK = True
sTempLabel = ""
sTempValue = ""
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'Set the 3 markers for [Label][Value]
iLabelStart = InStr(iTempA, sTempA, sClassLabel, vbTextCompare) 'InStr(1, sTempA, "col6 acol12 valueWrapper attrLabel ", vbTextCompare)
iLabelEnd = InStr(iLabelStart + 1, sTempA, sSpanEnd, vbTextCompare) 'InStr(iLabelStart + 1, sTempA, "</span>", vbTextCompare)
iLabelMid = InStrRev(sTempA, sSpanStart, iLabelEnd - 1, vbTextCompare) 'InStrRev(sTempA, "<", iLabelEnd - 1, vbTextCompare)
iValueStart = InStr(iTempA, sTempA, sClassValue, vbTextCompare) 'InStr(1, sTempA, "col6 acol12 valueWrapper attrValue", vbTextCompare)
iValueEnd = InStr(iValueStart + 1, sTempA, sSpanEnd, vbTextCompare) 'InStr(iValueStart + 1, sTempA, "</span>", vbTextCompare)
iValueMid = InStrRev(sTempA, sSpanStart, iValueEnd - 1, vbTextCompare) 'InStrRev(sTempA, "<", iValueEnd - 1, vbTextCompare)
'===================================================================================================================================
'Test Label / Value Markers:
If iLabelMid <= iLabelStart Or iLabelMid >= iLabelEnd Then bStartMidEndOK = False
If iValueMid <= iValueStart Or iValueMid >= iValueEnd Then bStartMidEndOK = False
If bStartMidEndOK = False Then
'send it to the error handler
GoTo gMark3Error
Else
'Continue processing
GoTo gMark3Valid
End If
'===================================================================================================================================
gMark3Error:
Debug.Print "The Start, Mid or End value for a string MARKING step [Label][Value] was invalid."
GoTo gFinishedWithReview
gMark3Valid:
sTempLabel = Trim(Mid(sTempA, iLabelMid + 1, iLabelEnd - iLabelMid - 1))
sTempValue = Trim(Mid(sTempA, iValueMid + 1, iValueEnd - iValueMid - 1))
GoTo gCheckForMatchingHeader
'===================================================================================================================================
gCheckForMatchingHeader:
'Reset Value so no value is over-written when no match to header is made
iTempG = 0
'check to see if the Label matches a correct Header value
For iLoop = LBound(aDump, 2) To UBound(aDump, 2)
If UCase(sTempLabel) = UCase(aHeaders(1, iLoop)) Then
iTempG = iLoop
GoTo gMatchMadeDoSomething
End If
Next iLoop
'===================================================================================================================================
gMatchMadeDoSomething:
'Test if Col Position Defined
If iTempG = 0 Then GoTo gIncrementStartSearchPosition
'populate Array
Application.StatusBar = "Processing a Match: " & iNextRow & " | " & sTempLabel & "(" & Left(sTempValue, 50) & ")"
aDump(1, iTempG) = sTempValue
GoTo gIncrementStartSearchPosition
gIncrementStartSearchPosition:
iTempA = InStr(iLabelStart + 1, sTempA, sClassLabel, vbTextCompare)
'A:Stop the process - no more matches
If iTempA = 0 Then GoTo gFinishedWithReview
'B: Check the remainder of the string - more matches available
GoTo gReviewHTMLstringReWrite
'===================================================================================================================================
'===================================================================================================================================
'===================================================================================================================================
'++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
gFinishedWithReview:
'Dump the results:
'===================================================================================================
Worksheets(Ws1).Select
Set Destination = Range(Worksheets(Ws1).Cells(iNextRow, MyDumpCol).Address)
Destination.Resize(UBound(aDump, 1), UBound(aDump, 2)).Value = aDump
'===================================================================================================
GoTo gMemoryCleanup
gMemoryCleanup:
'close down oWebBrowser and reset status bar
oWebBrowser.Quit
Set oWebBrowser = Nothing
Application.StatusBar = ""
gArrayMemoryCleanupOnly:
'Release Array memory
Erase aDump: Erase aHeaders
MsgBox "Done"
End Function