text extraction PDFNetSDK please help

Discussion in 'Python' started by sujan.dasmahapatra, Aug 6, 2013.

  1. sujan.dasmahapatra

    sujan.dasmahapatra Member

    Joined:
    Jun 11, 2009
    Messages:
    39
    Likes Received:
    0
    Trophy Points:
    6
    Gender:
    Male
    I am trying to extract text from a PDF file using the PDFNet SDK by PDFTron. But when execution reaches dumpAllText(), there is no element of type e_text. When I debug the code, I can see that the element type is e_path. I think e_path contains all of the text. How can I extract the text from e_path? Please help.
    Code:
    # `sys` must be imported before sys.version_info can be consulted; without
    # this line the version check below fails with NameError.
    import sys

    # Load the PDFNet binding that matches the running interpreter's major
    # version.
    if sys.version_info.major < 3:
        from PDFNetPython2 import *
    else:
        from PDFNetPython3 import *
    
    def printStyle(style):
        """Print the font attributes of ``style`` as one inline-style string.

        The line contains, in order, the values returned by
        ``style.GetFontName()``, ``style.GetFontSize()``, ``style.IsSerif()``
        and ``style.GetColor()``. All but the font name are passed through
        ``str()``; the font name is concatenated as-is.
        """
        font_name = style.GetFontName()
        font_size = str(style.GetFontSize())
        serif_flag = str(style.IsSerif())
        colour = str(style.GetColor())
        pieces = [
            " style=\"font-family:", font_name,
            "; font-size:", font_size,
            "; sans-serif: ", serif_flag,
            "; color:", colour,
            "\"",
        ]
        print("".join(pieces))
    
    def _printBBox (element):
        """Print the bounding box of ``element`` as "BBox: x1, y1, x2, y2"."""
        bbox = element.GetBBox()
        print("BBox: " + str(bbox.GetX1()) + ", " + str(bbox.GetY1()) + ", "
              + str(bbox.GetX2()) + ", " + str(bbox.GetY2()))

    def dumpAllText (reader):
        """Walk every element available from ``reader`` and print what it finds.

        Elements are pulled with ``reader.Next()`` until it returns ``None``.
        Text elements have their bounding box and text string printed; text
        begin/end, new-line and group begin/end markers print a label; form
        elements are entered with ``reader.FormBegin()``, processed
        recursively and left with ``reader.End()``; path elements print only
        their bounding box.

        Args:
            reader: an element reader on which ``Begin()`` has already been
                called by the caller.
        """
        element = reader.Next()
        # Identity test: the loop ends when the reader yields None.
        while element is not None:
            # Named `kind` so the builtin `type` is not shadowed.
            kind = element.GetType()
            if kind == Element.e_text_begin:
                print("Text Block Begin")
            elif kind == Element.e_text_end:
                print("Text Block End")
            elif kind == Element.e_text:
                _printBBox(element)
                print(element.GetTextString())
            elif kind == Element.e_text_new_line:
                print("New Line")
            elif kind == Element.e_form:
                # Descend into the form's own content stream, then return to
                # the enclosing stream.
                reader.FormBegin()
                dumpAllText(reader)
                reader.End()
            elif kind == Element.e_group_begin:
                print("Group begins")
            elif kind == Element.e_group_end:
                print("Group ends")
            elif kind == Element.e_path:
                # NOTE(review): GetTextData() is a text-element accessor, so it
                # is not called on path elements here -- confirm against the
                # PDFNet Element reference. If the PDF stores its glyphs as
                # outlines there is presumably no text string to recover from
                # these elements; only the geometry is reported.
                _printBBox(element)
            element = reader.Next()
    
    def main():
        """Initialize PDFNet and dump the element stream of every page.

        Opens the hard-coded input file, initializes its security handler,
        then runs ``dumpAllText`` over each page in turn. The document is
        closed even if extraction raises.
        """
        PDFNet.Initialize()

        # Path to the PDF file to read, relative to the working directory.
        input_path = "test.pdf"
        example5_low_level = True

        # This `if` must sit at the same indentation as the statements above;
        # the original was indented one level deeper (and level with its own
        # body), which is an IndentationError.
        if example5_low_level:
            doc = PDFDoc(input_path)
            try:
                doc.InitSecurityHandler()

                # Example 1. Extract all text content from the document
                reader = ElementReader()
                itr = doc.GetPageIterator()
                while itr.HasNext():
                    reader.Begin(itr.Current())
                    dumpAllText(reader)
                    reader.End()
                    itr.Next()
            finally:
                # Release the document whether or not extraction succeeded.
                doc.Close()
    
    # Run main() only when this file is executed as a script, not when it is
    # imported as a module.
    if __name__ == '__main__':
        main()
    
     

Share This Page

  1. This site uses cookies to help personalise content, tailor your experience and to keep you logged in if you register.
    By continuing to use this site, you are consenting to our use of cookies.
    Dismiss Notice