使用VBA解析从pdftotext .txt文件输出的数据。

huangapple go评论64阅读模式
英文:

Parsing data from pdftotext .txt file output using VBA

问题

我已经实现了以下代码来提取一些数据:

用于提取两个字符串之间文本的函数

Public Function SuperMid(ByVal strMain As String, str1 As String, str2 As String, Optional reverse As Boolean) As String
    '描述:提取字符串中由str1和str2定义的两个子字符串之间的部分。
    '开发人员:Ryan Wells(wellsr.com)
    '如何使用:- 将您的主字符串和您想在主字符串中查找的2个字符串作为参数传递。
    ' - 此函数将提取第一个字符串的末尾和下一个字符串的开头之间的值。
    ' - 如果可选布尔值“reverse”为true,则会执行InStrRev搜索,以查找主字符串中子字符串的最后一个实例。
    Dim i As Integer, j As Integer, temp As Variant
    On Error GoTo errhandler:
    If reverse = True Then
        i = InStrRev(strMain, str1)
        j = InStrRev(strMain, str2)
        If Abs(j - i) < Len(str1) Then j = InStrRev(strMain, str2, i)
        If i = j Then '尝试在字符串的第二半部分寻找唯一匹配项
            j = InStrRev(strMain, str2, i - 1)
        End If
    Else
        i = InStr(1, strMain, str1)
        j = InStr(1, strMain, str2)
        If Abs(j - i) < Len(str1) Then j = InStr(i + Len(str1), strMain, str2)
        If i = j Then '尝试在字符串的第二半部分寻找唯一匹配项
            j = InStr(i + 1, strMain, str2)
        End If
    End If
    If i = 0 And j = 0 Then GoTo errhandler:
    If j = 0 Then j = Len(strMain) + Len(str2) '使其任意大
    If i = 0 Then i = Len(strMain) + Len(str1) '使其任意大
    If i > j And j <> 0 Then '交换顺序
        temp = j
        j = i
        i = temp
        temp = str2
        str2 = str1
        str1 = temp
    End If
    i = i + Len(str1)
    SuperMid = Mid(strMain, i, j - i)
    Exit Function
    errhandler:
    MsgBox "提取字符串时出错。请检查您的输入" & vbNewLine & vbNewLine & "中止", , "未找到字符串"
    End
End Function

提取子例程

Sub extractPDF()
    
    Dim phoneNumber, shippingInfo, shippingAddress, itemInfo, poNumber As String
    Dim iTxtFile As Integer
    Dim strFile As String
    Dim strFileText As String
    strFile = "C:\blah\blah\blah\#62875.txt"
    iTxtFile = FreeFile
    Open strFile For Input As FreeFile
    strFileText = Input(LOF(iTxtFile), iTxtFile)
    Close iTxtFile
    
    Dim regexPattern As String
    Dim regex As Object
    Dim matches As Object
    Dim match As Object
    
    ' 正则表达式模式
    regexPattern = "Order #\d{5}"
    
    ' 创建正则表达式对象
    Set regex = CreateObject("VBScript.RegExp")
    
    ' 设置模式并忽略大小写
    With regex
        .Pattern = regexPattern
        .IgnoreCase = True
    End With
    
    ' 执行搜索
    Set matches = regex.Execute(strFileText)
    
    ' 循环遍历匹配项
    For Each match In matches
        ' 打印匹配的值
        poNumber = Right(match, 5)
    Next match
    
    shippingInfo = SuperMid(strFileText, "SHIP TO", "BILL TO")
    shippingAddress = SuperMid(shippingInfo, "", "United States")
    phoneNumber = Application.WorksheetFunction.Clean(SuperMid(shippingInfo, "United States", "BILL TO"))
    itemInfo = SuperMid(strFileText, "ITEMS QUANTITY", "Thank you for shopping with us!")
    Debug.Print "PO #: " & poNumber
    Debug.Print "Phone Number: " & phoneNumber
    Debug.Print shippingAddress
    Debug.Print itemInfo

End Sub

这使我能够获取运输信息,我进一步将其拆分为运输地址和电话号码(如果适用),PO号和包含项目信息的文本块。我遇到困难的是如何从itemInfo块中提取SKU和数量数据。根据以前的PDF,SKU行总是在数量行之后。因此,在这个示例中,SKU是VAR5M,数量是1(如果是2,它将是2 of 2)。有关如何实现我所需的最佳方法的任何想法吗?是否有比我已经设计的更好的实现方式?感谢您的帮助。

英文:

I am trying to implement a parsing function that will grab data from parts of a .txt file created using pdftotext. I hate PDFs! Essentially, I use pdftotext on a PDF file using the -raw option and I get a file like this:

SPORTS FANZ Order #62659
June 24, 2023
SHIP TO
Abe Lincoln
123 Main St
New York, NY 12345
United States
BILL TO
Abe Lincoln
123 Main St
New York, NY 12345
United States
ITEMS QUANTITY
Virginia Tech Hokies Basketball Cassell Coliseum Panoramic
Picture
Virginia Tech Hokies Panoramic Picture Select
VAT5M
1 of 1
Thank you for shopping with us!
Sports Fanz
123 Liberty St, Chester NY 12345, United States

Example with phone number and quantity of 2:

SPORTS FANZ Order #12345
June 24, 2023
SHIP TO
Abe Lincoln
123 Main St
New York, NY 12345
United States
(123) 123-4567
BILL TO
Abe Lincoln
123 Main St
New York, NY 12345
United States
ITEMS QUANTITY
Virginia Tech Hokies Basketball Cassell Coliseum Panoramic
Picture
Virginia Tech Hokies Panoramic Picture Select
VAT5M
2 of 2
Thank you for shopping with us!
Sports Fanz
123 Liberty St, Chester NY 12345, United States

Example with phone number (different format) and two SKUs:

SPORTS FANZ Order #58083
January 6, 2023
SHIP TO
Abe Lincoln
123 Main St
New York, NY 12345
United States
+12345678900
BILL TO
Abe Lincoln
123 Main St
New York, NY 12345
United States
ITEMS QUANTITY
Nebraska Cornhuskers Women&#39;s Volleyball Devaney Center Panoramic Picture
Nebraska Cornhuskers Panoramic Picture Select Frame
UNE11M
1 of 1
Kansas City Chiefs Super Bowl 54 Champions Panoramic Picture
Kansas City Chiefs SB 54 Champions Panoramic Picture Unframed
NFLSBC20CHF
1 of 1
Thank you for shopping with us!
Sports Fanz
123 Liberty St, Chester NY 12345, United States

I've implemented the following code already to grab some of the data:

Function for grabbing text between two strings

Public Function SuperMid(ByVal strMain As String, str1 As String, str2 As String, Optional reverse As Boolean) As String
&#39;DESCRIPTION: Extract the portion of a string between the two substrings defined in str1 and str2.
&#39;DEVELOPER: Ryan Wells (wellsr.com)
&#39;HOW TO USE: - Pass the argument your main string and the 2 strings you want to find in the main string.
&#39; - This function will extract the values between the end of your first string and the beginning
&#39; of your next string.
&#39; - If the optional boolean &quot;reverse&quot; is true, an InStrRev search will occur to find the last
&#39; instance of the substrings in your main string.
Dim i As Integer, j As Integer, temp As Variant
On Error GoTo errhandler:
If reverse = True Then
i = InStrRev(strMain, str1)
j = InStrRev(strMain, str2)
If Abs(j - i) &lt; Len(str1) Then j = InStrRev(strMain, str2, i)
If i = j Then &#39;try to search 2nd half of string for unique match
j = InStrRev(strMain, str2, i - 1)
End If
Else
i = InStr(1, strMain, str1)
j = InStr(1, strMain, str2)
If Abs(j - i) &lt; Len(str1) Then j = InStr(i + Len(str1), strMain, str2)
If i = j Then &#39;try to search 2nd half of string for unique match
j = InStr(i + 1, strMain, str2)
End If
End If
If i = 0 And j = 0 Then GoTo errhandler:
If j = 0 Then j = Len(strMain) + Len(str2) &#39;just to make it arbitrarily large
If i = 0 Then i = Len(strMain) + Len(str1) &#39;just to make it arbitrarily large
If i &gt; j And j &lt;&gt; 0 Then &#39;swap order
temp = j
j = i
i = temp
temp = str2
str2 = str1
str1 = temp
End If
i = i + Len(str1)
SuperMid = Mid(strMain, i, j - i)
Exit Function
errhandler:
MsgBox &quot;Error extracting strings. Check your input&quot; &amp; vbNewLine &amp; vbNewLine &amp; &quot;Aborting&quot;, , &quot;Strings not found&quot;
End
End Function

Extraction Sub

Sub extractPDF()
Dim phoneNumber, shippingInfo, shippingAddress, itemInfo, poNumber As String
Dim iTxtFile As Integer
Dim strFile As String
Dim strFileText As String
strFile = &quot;C:\blah\blah\blah\#62875.txt&quot;
iTxtFile = FreeFile
Open strFile For Input As FreeFile
strFileText = Input(LOF(iTxtFile), iTxtFile)
Close iTxtFile
Dim regexPattern As String
Dim regex As Object
Dim matches As Object
Dim match As Object
&#39; Regular expression pattern
regexPattern = &quot;Order #\d{5}&quot;
&#39; Create a regular expression object
Set regex = CreateObject(&quot;VBScript.RegExp&quot;)
&#39; Set the pattern and ignore case
With regex
.Pattern = regexPattern
.IgnoreCase = True
End With
&#39; Perform the search
Set matches = regex.Execute(strFileText)
&#39; Loop through the matches
For Each match In matches
&#39; Print the matched value
poNumber = Right(match, 5)
Next match
shippingInfo = SuperMid(strFileText, &quot;SHIP TO&quot;, &quot;BILL TO&quot;)
shippingAddress = SuperMid(shippingInfo, &quot;&quot;, &quot;United States&quot;)
phoneNumber = Application.WorksheetFunction.Clean(SuperMid(shippingInfo, &quot;United States&quot;, &quot;BILL TO&quot;))
itemInfo = SuperMid(strFileText, &quot;ITEMS QUANTITY&quot;, &quot;Thank you for shopping with us!&quot;)
Debug.Print &quot;PO #: &quot; &amp; poNumber
Debug.Print &quot;Phone Number: &quot; &amp; phoneNumber
Debug.Print shippingAddress
Debug.Print itemInfo
End Sub

This gets me the shipping info, which I further break down into shipping address and phone number (if applicable), PO #, and the block of text containing the item information. What I'm struggling with is how to extract SKU and quantity data from the itemInfo block. Based on previous PDFs, the SKU line is always followed by the quantity line. So, in this example, SKU is VAR5M and quantity is 1 (if it was 2 it would say 2 of 2). Any ideas on the best way to implement what I need? Is there a better way to implement my needs than what I've already designed? Thanks for your help.

答案1

得分: 2

以下是翻译好的部分:

请尝试下一个函数。它使用数组并且应该足够快:

Function ExtractDat(arrTxt) As Variant
Dim arrFin, mtch, arrH, arr, i As Long, k As Long
'                                                           0                       1                        2                3             4       5       6          7             8                9                      10        11      12     13        14         15              16             17             18
Const header As String = "Order number, (Ship To) Name, Address1, Address2, City, State, Zip, Country, Phone, (Bill To) Name, Address, City, State, Zip, Country, SKU1, Value SKU1, SKU2, Value SKU2"
arrH = Split(header, ",")
ReDim arrFin(UBound(arrH))
With CreateObject("Vbscript.RegExp")
.Pattern = "\d{5}"
.Global = False
arrFin(0) = .Execute(arrTxt(0))(0) 'order number
End With
arrFin(1) = arrTxt(3)             'Send To Name
arrFin(2) = arrTxt(4)             'Send To Address
arrFin(3) = ""                    'No second Address (assumption...)
mtch = Application.match("BILL TO", arrTxt, 0)
If IsError(mtch) Then MsgBox """BILL TO"" & "" could not be found in the analyzed data...", vbInformation, "BILL TO missing": Exit Function
If mtch = 8 Then 'no Phone number existing, no second Address, too...
arr = Split(arrTxt(5), ", ") 'split City from State and Zip
arrFin(4) = arr(0)            'Send To City
arrFin(5) = Split(arr(1))(0)  'Send To State
arrFin(6) = Split(arr(1))(1)  'Send To Zip
arrFin(7) = arrTxt(6)         'Country
arrFin(8) = ""                'No Phone number
ElseIf mtch = 9 Then
If InStr(arrTxt(5), ",") = 0 Then 'no comma in string (second address...)
arrFin(3) = arrTxt(5)         'second Address
arr = Split(arrTxt(6), ", ")  'split City from State and Zip
arrFin(4) = arr(0)            'Send To City
arrFin(5) = Split(arr(1))(0)  'Send To State
arrFin(6) = Split(arr(1))(1)  'Send To Zip
arrFin(7) = arrTxt(6)         'Country
arrFin(8) = ""                'No Phone number
Else    'No second address
arr = Split(arrTxt(5), ", ")   'split City from State and Zip
arrFin(4) = arr(0)            'Send To City
arrFin(5) = Split(arr(1))(0)  'Send To State
arrFin(6) = Split(arr(1))(1)  'Send To Zip
arrFin(7) = arrTxt(6)         'Country
arrFin(8) = arrTxt(7)         'Phone number
End If
ElseIf mtch = 10 Then 'second Address and Phone number exist
arrFin(3) = arrTxt(5)         'second Address
arr = Split(arrTxt(6), ", ")  'split City from State and Zip
arrFin(4) = arr(0)            'Send To City
arrFin(5) = Split(arr(0))(0)  'Send To State
arrFin(6) = Split(arr(0))(1)  'Send To Zip
arrFin(7) = arrTxt(7)         'Country
arrFin(8) = arrTxt(8)         'Phone number
End If
arrFin(9) = arrTxt(mtch)        'Bill To Name
arrFin(10) = arrTxt(mtch + 1)   'Bill To Address
arr = Split(arrTxt(mtch + 2), ", ")
arrFin(11) = arr(0)             'Bill To City
arrFin(12) = Split(arr(1))(0)   'Bill To State
arrFin(13) = Split(arr(1))(1)   'Bill To Zip
arrFin(14) = arrTxt(mtch + 3)    'Bill To Country
'extract SCUs and their values:
For i = 0 To UBound(arrTxt)
If arrTxt(i) Like "#* of #*" Then
arrFin(15 + k) = arrTxt(i - 1)
arrFin(16 + k) = Split(arrTxt(i))(0)
k = k + 2
End If
Next i
ExtractDat = Array(arrH, arrFin)
End Function

它可以用以下代码在活动工作表中返回:

Sub UseExtractDat()
Dim strFile As String, arrT, retArr
strFile = "C:\blah\blah\blah\#62875.txt"
'Place the content of the text file in an array (splitting by end of line)
arrT = Split(CreateObject("Scripting.FileSystemObject").OpenTextFile(strFile, 1).ReadAll, vbCrLf)
retArr = ExtractDat(arrT)
Range("A1").Resize(1, UBound(retArr(0)) + 1).Value2 = retArr(0)
Range("A2").Resize(1, UBound(retArr(1)) + 1).Value2 = retArr(1)
End Sub

测试后请提供一些反馈。

英文:

Please, try the next function. It uses arrays and should be fast enough:

Function ExtractDat(arrTxt) As Variant
Dim arrFin, mtch, arrH, arr, i As Long, k As Long
&#39;                                                           0                       1                        2                3             4       5       6          7             8                9                      10        11      12     13        14         15              16             17             18
Const header As String = &quot;Order number, (Ship To) Name, Address1, Address2, City, State, Zip, Country, Phone, (Bill To) Name, Address, City, State, Zip, Country, SKU1, Value SKU1, SKU2, Value SKU2&quot;
arrH = Split(header, &quot;,&quot;)
ReDim arrFin(UBound(arrH))
With CreateObject(&quot;Vbscript.RegExp&quot;)
.Pattern = &quot;\d{5}&quot;
.Global = False
arrFin(0) = .Execute(arrTxt(0))(0) &#39;order number
End With
arrFin(1) = arrTxt(3)             &#39;Send To Name
arrFin(2) = arrTxt(4)             &#39;Send To Address
arrFin(3) = &quot;&quot;                    &#39;No second Address (assumption...)
mtch = Application.match(&quot;BILL TO&quot;, arrTxt, 0)
If IsError(mtch) Then MsgBox &quot;&quot;&quot;BILL TO&quot; &amp; &quot; could not be found in the analyzed data...&quot;, vbInformation, &quot;BILL TO missing&quot;: Exit Function
If mtch = 8 Then &#39;no Phone number existing, no second Address, too...
arr = Split(arrTxt(5), &quot;, &quot;) &#39;split City from State and Zip
arrFin(4) = arr(0)            &#39;Send To City
arrFin(5) = Split(arr(1))(0)  &#39;Send To State
arrFin(6) = Split(arr(1))(1)  &#39;Send To Zip
arrFin(7) = arrTxt(6)         &#39;Country
arrFin(8) = &quot;&quot;                &#39;No Phone number
ElseIf mtch = 9 Then
If InStr(arrTxt(5), &quot;,&quot;) = 0 Then &#39;no comma in string (second address...)
arrFin(3) = arrTxt(5)         &#39;second Address
arr = Split(arrTxt(6), &quot;, &quot;)  &#39;split City from State and Zip
arrFin(4) = arr(0)            &#39;Send To City
arrFin(5) = Split(arr(1))(0)  &#39;Send To State
arrFin(6) = Split(arr(1))(1)  &#39;Send To Zip
arrFin(7) = arrTxt(6)         &#39;Country
arrFin(8) = &quot;&quot;                &#39;No Phone number
Else    &#39;No second address
arr = Split(arrTxt(5), &quot;, &quot;)   &#39;split City from State and Zip
arrFin(4) = arr(0)            &#39;Send To City
arrFin(5) = Split(arr(1))(0)  &#39;Send To State
arrFin(6) = Split(arr(1))(1)  &#39;Send To Zip
arrFin(7) = arrTxt(6)         &#39;Country
arrFin(8) = arrTxt(7)         &#39;Phone number
End If
ElseIf mtch = 10 Then &#39;second Address and Phone number exist
arrFin(3) = arrTxt(5)         &#39;second Address
arr = Split(arrTxt(6), &quot;, &quot;)  &#39;split City from State and Zip
arrFin(4) = arr(0)            &#39;Send To City
arrFin(5) = Split(arr(0))(0)  &#39;Send To State
arrFin(6) = Split(arr(0))(1)  &#39;Send To Zip
arrFin(7) = arrTxt(7)         &#39;Country
arrFin(8) = arrTxt(8)         &#39;Phone number
End If
arrFin(9) = arrTxt(mtch)        &#39;Bill To Name
arrFin(10) = arrTxt(mtch + 1)   &#39;Bill To Address
arr = Split(arrTxt(mtch + 2), &quot;, &quot;)
arrFin(11) = arr(0)             &#39;Bill To City
arrFin(12) = Split(arr(1))(0)   &#39;Bill To State
arrFin(13) = Split(arr(1))(1)   &#39;Bill To Zip
arrFin(14) = arrTxt(mtch + 3)    &#39;Bill To Country
&#39;extract SCUs and their values:
For i = 0 To UBound(arrTxt)
If arrTxt(i) Like &quot;#* of #*&quot; Then
arrFin(15 + k) = arrTxt(i - 1)
arrFin(16 + k) = Split(arrTxt(i))(0)
k = k + 2
End If
Next i
ExtractDat = Array(arrH, arrFin)
End Function
```
It can be used to return in the active sheet with such a code. It processes a text file, placing its content in an array and returns on the first two rows of the active sheet:
```
Sub UseExtractDat()
Dim strFile As String, arrT, retArr
strFile = &quot;C:\blah\blah\blah\#62875.txt&quot;
&#39;Place the content of the text file in an array (splitting by end of line)
arrT = Split(CreateObject(&quot;Scripting.FileSystemObject&quot;).OpenTextFile(strFile, 1).ReadAll, vbCrLf)
retArr = ExtractDat(arrT)
Range(&quot;A1&quot;).Resize(1, UBound(retArr(0)) + 1).Value2 = retArr(0)
Range(&quot;A2&quot;).Resize(1, UBound(retArr(1)) + 1).Value2 = retArr(1)
End Sub
```
Please, send some feedback after testing it.
</details>
# 答案2
**得分**: 1
以下是代码的翻译部分:
```vba
如果您在单元格 A1 中存储了一个文本字符串,并且您想要使用以下代码获取 SKU 和数量。
Sub Demo()
Dim objRegExp As Object
Dim objMatches As Object
Set objRegExp = CreateObject("vbscript.regexp")
With objRegExp
.IgnoreCase = True
.Global = True
.Pattern = "([A-Z0-9]+)\s*(\d+) of \d+"
If .Test([a1]) Then
Set objMatches = objRegExp.Execute([a1])
For Each objMtch In objMatches
With objMtch.submatches
If .Count = 2 Then
SKU = .Item(0)
QTY = .Item(1)
Debug.Print "SKU:" & SKU & vbNewLine _
& "Quantity:" & QTY
End If
End With
Next
End If
End With
Set objMatches = Nothing
Set objRegExp = Nothing
End Sub
```
<details>
<summary>英文:</summary>
If you have a text string stored in cell A1 and you would get SKU and Quantity with following code.
```
Sub Demo()
Dim objRegExp As Object
Dim objMatches As Object
Set objRegExp = CreateObject(&quot;vbscript.regexp&quot;)
With objRegExp
.IgnoreCase = True
.Global = True
.Pattern = &quot;([A-Z0-9]+)\s*(\d+) of \d+&quot;
If .Test([a1]) Then
Set objMatches = objRegExp.Execute([a1])
For Each objMtch In objMatches
With objMtch.submatches
If .Count = 2 Then
SKU = .Item(0)
QTY = .Item(1)
Debug.Print &quot;SKU:&quot; &amp; SKU &amp; vbNewLine _
&amp; &quot;Quantity:&quot; &amp; QTY
End If
End With
Next
End If
End With
Set objMatches = Nothing
Set objRegExp = Nothing
End Sub
```
</details>

huangapple
  • 本文由 发表于 2023年7月11日 05:51:33
  • 转载请务必保留本文链接:https://go.coder-hub.com/76657551.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定