Word: Extract Embedded Office files, PDFs, and ZIPs

Sometimes you get a lot of Word/Excel/Powerpoint docs with embedded DOCX, XLSX, VSDX, PPTX, PDFs, and ZIP files which you need to extract.

Here is a VBA macro that combs through those Office Docs and extracts them.

It also renames them something sensible (based on the file name of the originating doc).

To use it:

1. Copy all the Office Docs (must be docx or pptx or xlsx) to a single folder.

2. Unzip all the docx/pptx/xlsx files (these are just zip files which have been renamed) to this same folder.

3. Run the macro "start_extract_PDFs_from_folders"

Sub start_extract_PDFs_from_folders()


' Extracts embedded PDFs from multiple folders within a folder.

' 2022-04-28 Edward Chan

' To use, unzip all the Word Files in a single folder, then run this macro.



' Folder traverse from https://stackoverflow.com/questions/22645347/loop-through-all-subfolders-using-vba

' PDF Extract from https://stackoverflow.com/questions/64157886/how-to-extract-embedded-pdf-from-word-document-in-linux-mac/64157887?utm_source=pocket_mylist



    

    Dim FileSystem As Object

    Dim HostFolder As String

    

    

    Dim dlgOpen As FileDialog

    Set dlgOpen = Application.FileDialog( _

    FileDialogType:=msoFileDialogFolderPicker)

    With dlgOpen

        .AllowMultiSelect = True

        .Title = "Select the folder containing all the unzipped docx folders to extract PDF file(s) from"

        .InitialFileName = "*.docx"

        .Show

    End With


    HostFolder = dlgOpen.SelectedItems.Item(1)


    Set FileSystem = CreateObject("Scripting.FileSystemObject")

    Call DoFolder(FileSystem.GetFolder(HostFolder), "word")

    Call DoFolder(FileSystem.GetFolder(HostFolder), "xl")

    Call DoFolder(FileSystem.GetFolder(HostFolder), "visio")

    Call DoFolder(FileSystem.GetFolder(HostFolder), "ppt")

End Sub


Sub DoFolder(Folder, app As String)

    Dim SubFolder

    For Each SubFolder In Folder.SubFolders

        'DoFolder SubFolder ' Uncomment this for recursive

        Debug.Print SubFolder


        Dim Contents As String

        Dim PDF As String

        Dim ZIP As String

        Dim hFile As Integer

        Dim i As Long, j As Long

        Dim ExtractedZippedDocxFolder, FileNameBin, FileNamePDF, BinFolderPath, FileNameNew, FolderUp As String

        Dim fileIndex As Integer

       

    '    Dim dlgOpen As FileDialog

    '    Set dlgOpen = Application.FileDialog( _

    '    FileDialogType:=msoFileDialogFolderPicker)

    '    With dlgOpen

    '        .AllowMultiSelect = True

    '        .Title = "Select the unzipped docx folder to extract PDF file(s) from"

    '        .InitialFileName = "*.docx"

    '        .Show

    '    End With

    '    ExtractedZippedDocxFolder = dlgOpen.SelectedItems.Item(1)

        

        ExtractedZippedDocxFolder = SubFolder

        

        BinFolderPath = ExtractedZippedDocxFolder + "\" & app & "\embeddings"

        Set objFSO = CreateObject("Scripting.FileSystemObject")

        If objFSO.FolderExists(BinFolderPath) = True Then  ' There are embedded objects

            Set objFolder = objFSO.GetFolder(BinFolderPath)

            fileIndex = 0

           

            

            For Each objFile In objFolder.Files

                If LCase$(Right$(objFile.Name, 4)) = ".bin" Then

                    FileNameIndex = Left$(objFile.Name, Len(objFile.Name) - Len(".bin"))

                    FileNameBin = BinFolderPath + "\" + FileNameIndex + ".bin"

                    FileNamePDF = BinFolderPath + "\" + FileNameIndex + ".pdf"

               

                    hFile = FreeFile

                    Open FileNameBin For Binary Access Read As #hFile

                    Contents = String(LOF(hFile), vbNullChar)

                    Get #hFile, , Contents

                    Close #hFile

               

                    i = InStrB(1, Contents, "%PDF")

                    If i <> 0 Then ' It is a PDF

                         j = InStrB(i, Contents, "%%EOF")

                         If (InStrB(j + 1, Contents, "%%EOF") > 0) Then j = InStrB(j + 1, Contents, "%%EOF")

                    

                         PDF = MidB(Contents, i, j + 5 - i + 12)

                    

                         Open FileNamePDF For Binary Access Write As #hFile

                         Put #hFile, , PDF

                         Close #hFile

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".pdf"

                         Name FileNamePDF As FileNameNew

                    

                         fileIndex = fileIndex + 1

                    Else


                      ' Zip files

                      i = InStrB(1, Contents, "PK" & Chr(3) & Chr(4))

                      If i <> 0 Then ' It is a zip file. Just find the start and go to the end. There will be a little bit of garbage at the end but it'll be ignored by zip programs. It's too complicated to figure out where a ZIP file ends!!

                         ZIP = MidB(Contents, i, Len(Contents))

                         FileNameZIP = BinFolderPath + "\" + FileNameIndex + ".zip"


                         Open FileNameZIP For Binary Access Write As #hFile

                         Put #hFile, , ZIP

                         Close #hFile

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".zip"

                         Name FileNameZIP As FileNameNew

                         fileIndex = fileIndex + 1

                      End If

                   End If

                Else

                   ' Check for docx files

                   If LCase$(Right$(objFile.Name, 5)) = ".docx" Then

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".docx"

                         FileCopy objFile.Path, FileNameNew

                         fileIndex = fileIndex + 1

                   End If

                   ' Check for xlsx files

                   If LCase$(Right$(objFile.Name, 5)) = ".xlsx" Then

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".xlsx"

                         FileCopy objFile.Path, FileNameNew

                         fileIndex = fileIndex + 1

                   End If

                   ' Check for pptx files

                   If LCase$(Right$(objFile.Name, 5)) = ".pptx" Then

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".pptx"

                         FileCopy objFile.Path, FileNameNew

                         fileIndex = fileIndex + 1

                   End If

                   ' Check for vsdx files

                   If LCase$(Right$(objFile.Name, 5)) = ".vsdx" Then

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".vsdx"

                         FileCopy objFile.Path, FileNameNew

                         fileIndex = fileIndex + 1

                   End If

                End If

            Next

            If fileIndex = 0 Then

                Debug.Print "Unable to find any bin file in the givven unzipped docx file content"

            Else

                Debug.Print Str(fileIndex) + "  files were processed"

            End If


        End If


    Next

'    Dim File

'    For Each File In Folder.Files

'    Next

End Sub











Sub export_PDFs()

' This is for a single file only

    Dim Contents As String

    Dim PDF As String

    Dim hFile As Integer

    Dim i As Long, j As Long

    Dim ExtractedZippedDocxFolder, FileNameBin, FileNamePDF, BinFolderPath As String

    Dim fileIndex As Integer

   

    Dim dlgOpen As FileDialog

    Set dlgOpen = Application.FileDialog( _

    FileDialogType:=msoFileDialogFolderPicker)

    With dlgOpen

        .AllowMultiSelect = False

        .Title = "Select the unzipped docx folder to extract PDF file(s) from"

        .InitialFileName = "*.docx"

        .Show

    End With

    ExtractedZippedDocxFolder = dlgOpen.SelectedItems.Item(1)

    BinFolderPath = ExtractedZippedDocxFolder + "\word\embeddings"

    Set objFSO = CreateObject("Scripting.FileSystemObject")

    Set objFolder = objFSO.GetFolder(BinFolderPath)

    fileIndex = 0

   

    For Each objFile In objFolder.Files

        If LCase$(Right$(objFile.Name, 4)) = ".bin" Then

            FileNameIndex = Left$(objFile.Name, Len(objFile.Name) - Len(".bin"))

            FileNameBin = BinFolderPath + "\" + FileNameIndex + ".bin"

            FileNamePDF = BinFolderPath + "\" + FileNameIndex + ".pdf"

       

            hFile = FreeFile

            Open FileNameBin For Binary Access Read As #hFile

            Contents = String(LOF(hFile), vbNullChar)

            Get #hFile, , Contents

            Close #hFile

       

            i = InStrB(1, Contents, "%PDF")

            If i <> 0 Then ' It is a PDF

                 j = InStrB(i, Contents, "%%EOF")

                 If (InStrB(j + 1, Contents, "%%EOF") > 0) Then j = InStrB(j + 1, Contents, "%%EOF")

            

                 PDF = MidB(Contents, i, j + 5 - i + 12)

            

                 Open FileNamePDF For Binary Access Write As #hFile

                 Put #hFile, , PDF

                 Close #hFile

                 fileIndex = fileIndex + 1

            End If

        End If

    Next

    If fileIndex = 0 Then

        MsgBox "Unable to find any bin file in the givven unzipped docx file content"

    Else

        MsgBox Str(fileIndex) + "  files were processed"

    End If

 

End Sub