1. Introducción

Este ejemplo muestra cómo analizar un formulario en formato PDF con el Form Parser de Document AI y extraer ciertas entidades (nombre, dirección, teléfono, etc.), tanto en forma de lista en Python como en forma de cajas con coordenadas X, Y para resaltarlas en el propio PDF.

Contenido original en inglés y adaptación de aquí

2. Instalación

!pip3 install google-cloud-documentai
!pip3 install wand
!pip3 install pillo
#!apt-get update
#!apt-get install poppler-utils # for converting pdf to jpg. We'll use this for displaying the pdf later
#!apt-get install libmagickwand-dev

Hay que hacer un reset del runtime antes de continuar.

from google.cloud import documentai_v1beta2 as documentai
from wand.image import Image as WImage
from PIL import Image, ImageDraw
import os

Configuramos el proyecto en Google Cloud:

# When running in Colab, execute the following to authenticate the current user
from google.colab import auth
auth.authenticate_user()
#@title Set Project Id
# Placeholder values: replace them (or fill in the Colab form) before running.
# PROJECT_ID is the Google Cloud project used by the gcloud commands and the API call.
PROJECT_ID = 'PROJECT_ID'  #@param {type: "string"}
# PDF_URI is the Cloud Storage location of the PDF (it is passed to `gsutil cp`).
PDF_URI = "PDF_FILE_STORED_IN_GCS" #@param {type: "string"}
# SERVICE_ACCOUNT_NAME is the short name of the service account whose key is created.
SERVICE_ACCOUNT_NAME="SERVICE_ACCOUNT_NAME" #@param {type: "string"}

Al usar la librería Python de Document AI, necesitamos crear una cuenta de servicio y descargar su clave:

# Make PROJECT_ID the active project for the gcloud CLI
!gcloud config set project '{PROJECT_ID}'
# Uncomment the following line to create a new service account
#!gcloud iam service-accounts create {SERVICE_ACCOUNT_NAME}
# Create a new key for the service account and save it locally as ./key.json
# NOTE(review): key.json is a private credential; keep it out of version control.
!gcloud iam service-accounts keys create ./key.json --iam-account {SERVICE_ACCOUNT_NAME}@{PROJECT_ID}.iam.gserviceaccount.com
# Expose the key path through GOOGLE_APPLICATION_CREDENTIALS for the client library
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./key.json"

Habilitar dos APIs: la de Document AI y la de Invoice AI

# Enable the Document AI and Invoice services for the active gcloud project
!gcloud services enable documentai.googleapis.com
!gcloud services enable invoice.googleapis.com
# Download the file: copy the PDF at PDF_URI to ./doc.pdf
!gsutil cp $PDF_URI ./doc.pdf

3. Llamada al Form Parser de Document AI

def parse_form(project_id=PROJECT_ID,
               input_uri=PDF_URI):
    """Send a PDF stored in Cloud Storage to the Document AI form parser.

    Parameters:
    project_id: Google Cloud project id used to build the request parent
    input_uri: URI of the PDF to analyze, wrapped in a GcsSource

    Returns:
    whatever client.process_document returns for the request
    """
    service = documentai.DocumentUnderstandingServiceClient()
    types = documentai.types

    # Where the input lives and what it is. mime_type can be
    # application/pdf, image/tiff, and image/gif, or application/json.
    source_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Optional hints that can improve form parsing. Each hint's key is text
    # likely to appear in the document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    hints = [
        types.KeyValuePairHint(key='Emergency Contact',
                               value_types=['NAME']),
        types.KeyValuePairHint(key='Referred By'),
    ]

    # enabled=True is what switches form extraction on
    extraction_params = types.FormExtractionParams(
        enabled=True, key_value_pair_hints=hints)

    # Location can be 'us' or 'eu'
    process_request = types.ProcessDocumentRequest(
        parent=f'projects/{project_id}/locations/us',
        input_config=source_config,
        form_extraction_params=extraction_params)

    return service.process_document(request=process_request)
# Parse the configured PDF (input_uri falls back to its default); `doc` is reused below
doc = parse_form(PROJECT_ID)

Respuesta del API:

# Number of pages in the document
len(doc.pages)
# Number of form fields detected on the first page
len(doc.pages[0].form_fields)
# First detected field in detail
doc.pages[0].form_fields[0]

4. Función para parsear la respuesta del API

def get_text(document, el):
    """Return the text that a Document AI element points to.

    Doc AI identifies form fields by their offsets in the document
    text. This function converts those offsets to the text snippet.

    Parameters:
    document: object whose `text` attribute holds the full document text
    el: element exposing `text_anchor.text_segments`, each segment
        carrying `start_index` and `end_index` offsets into `document.text`

    Returns:
    str with the text of every segment concatenated in order; an empty
    string when the element has no text segments
    """
    full_text = document.text
    # Text that spans several lines is stored as several segments,
    # so the pieces are joined back together in their original order.
    return ''.join(
        full_text[segment.start_index:segment.end_index]
        for segment in el.text_anchor.text_segments
    )
# The API does not return the text of a form field directly; it returns
# character offsets into the document text, for example:
#  text_anchor {
#     text_segments {
#       start_index: 325
#       end_index: 327
#     }
#   }
# The get_text helper turns those offsets into the actual words.

for form_field in doc.pages[0].form_fields:
    # Resolve name and value the same way, trimming surrounding whitespace
    field_name, field_value = (
        get_text(doc, part).strip()
        for part in (form_field.field_name, form_field.field_value)
    )
    print(f"{field_name}\t{field_value}")

5. Impresión de resultados

Extraemos las imágenes del PDF descargado en formato JPG para poder dibujar las cajas sobre ellas.

# Extract the images embedded in doc.pdf, using "doc" as the output file prefix
# (-j saves JPEG-encoded images as .jpg files; the next cell opens doc-000.jpg).
!pdfimages -j doc.pdf doc

Imprimimos entidades y dibujamos las cajas alrededor:

def _draw_bounding_poly(draw, bounding_poly, image_size, outline):
    """Outline one bounding polygon on a PIL drawing surface.

    Parameters:
    draw: ImageDraw.Draw object to draw on
    bounding_poly: object exposing `normalized_vertices`, each with x and y
    image_size: (width, height) in pixels used to scale the vertices
    outline: color of the polygon outline
    """
    width, height = image_size
    points = [(vertex.x * width, vertex.y * height)
              for vertex in bounding_poly.normalized_vertices]
    # A polygon needs at least three points; skip elements without a
    # usable box instead of failing on a missing vertex.
    if len(points) < 3:
        return
    draw.polygon(points, outline=outline)


im = Image.open('doc-000.jpg')
draw = ImageDraw.Draw(im)
for form_field in doc.pages[0].form_fields:
    # Field names are outlined in red, field values in blue
    _draw_bounding_poly(draw, form_field.field_name.bounding_poly,
                        im.size, 'red')
    _draw_bounding_poly(draw, form_field.field_value.bounding_poly,
                        im.size, 'blue')
# Keep the image as the last expression so the notebook renders it
im