Go4Expert

Go4Expert (http://www.go4expert.com/)
-   Python (http://www.go4expert.com/forums/python/)
-   -   text extraction PDFNetSDK please help (http://www.go4expert.com/forums/text-extraction-pdfnetsdk-help-t29756/)

sujan.dasmahapatra 6Aug2013 10:36

text extraction PDFNetSDK please help
 
I am trying to extract text from a PDF file using the PDFNet SDK by PDFTron. But when execution reaches dumpAllText(), no element has the type e_text. When I debug the code, I can see that the element type is e_path, so I think e_path contains all of the text. How can I extract the text from e_path? Please help.
Code:

import sys

# PDFNet ships separate binding modules for Python 2 and Python 3; load the
# one that matches the running interpreter. `sys` must be imported first,
# otherwise the version check itself raises NameError.
if sys.version_info.major < 3:
    from PDFNetPython2 import *
else:
    from PDFNetPython3 import *

def printStyle (style):
    """Print the font attributes of *style* as one ``style="..."`` line.

    The line contains, in order, the font name, the font size, the value of
    ``style.IsSerif()`` (emitted under the ``sans-serif`` label) and the
    color, each converted with ``str`` except the font name, which is used
    as returned.
    """
    pieces = [
        ' style="font-family:',
        style.GetFontName(),
        "; font-size:",
        str(style.GetFontSize()),
        "; sans-serif: ",
        str(style.IsSerif()),
        "; color:",
        str(style.GetColor()),
        '"',
    ]
    print("".join(pieces))

def _printBBox (element):
    """Print the bounding box of *element* as ``BBox: x1, y1, x2, y2``."""
    bbox = element.GetBBox()
    print("BBox: " + str(bbox.GetX1()) + ", " + str(bbox.GetY1()) + ", "
          + str(bbox.GetX2()) + ", " + str(bbox.GetY2()))

def dumpAllText (reader):
    """Print a textual trace of every element produced by *reader*.

    Elements are pulled with ``reader.Next()`` until it returns ``None``.
    Text runs are printed with their bounding box and string; text-block,
    new-line and group markers are printed as labels; form XObjects are
    entered with ``reader.FormBegin()``, dumped recursively and left with
    ``reader.End()``. Path elements are reported by bounding box only.

    Args:
        reader: an ElementReader on which ``Begin`` (or ``FormBegin``) has
            already been called.
    """
    element = reader.Next()
    while element is not None:
        # Named elem_type so the builtin ``type`` is not shadowed.
        elem_type = element.GetType()
        if elem_type == Element.e_text_begin:
            print("Text Block Begin")
        elif elem_type == Element.e_text_end:
            print("Text Block End")
        elif elem_type == Element.e_text:
            _printBBox(element)
            print(element.GetTextString())
        elif elem_type == Element.e_text_new_line:
            print("New Line")
        elif elem_type == Element.e_form:
            # A form XObject has its own content stream: descend into it,
            # dump its elements, then return to the parent stream.
            reader.FormBegin()
            dumpAllText(reader)
            reader.End()
        elif elem_type == Element.e_group_begin:
            print("Group begins")
        elif elem_type == Element.e_group_end:
            print("Group ends")
        elif elem_type == Element.e_path:
            # Per the PDFNet Element documentation, type-specific accessors
            # such as GetTextData() may only be called on elements of the
            # matching type, so it must not be used on a path. A path is
            # drawing geometry and has no text string to read.
            # NOTE(review): a page whose visible text arrives only as
            # e_path elements presumably had its glyphs converted to
            # outlines; recovering that text would need OCR - confirm.
            _printBBox(element)
            print("Path element: no text content to extract")
        element = reader.Next()

def main():
    """Initialize PDFNet and dump the content elements of every page.

    Opens the PDF named by ``input_path``, walks its pages with a page
    iterator and passes each page's elements to ``dumpAllText``. The
    document is closed when processing ends, including on error.
    """
    PDFNet.Initialize()

    # Path of the PDF file to open.
    input_path = "test.pdf"
    example5_low_level = True

    if example5_low_level:
        doc = PDFDoc(input_path)
        try:
            doc.InitSecurityHandler()

            # Example 1. Extract all text content from the document
            reader = ElementReader()
            itr = doc.GetPageIterator()
            while itr.HasNext():
                # Begin/End bracket the processing of a single page.
                reader.Begin(itr.Current())
                dumpAllText(reader)
                reader.End()
                itr.Next()
        finally:
            # Release the document even if element processing raised.
            doc.Close()

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



All times are GMT +5.5. The time now is 13:00.