Formularios con Google Document AI
Análisis de documentos PDF y extracción de campos de formularios (Form parser) con Google Document AI
- 1. Introducción
- 2. Instalación
- 3. Llamada al Form Parser de Document AI
- 4. Función para parsear la respuesta del API
- 5. Impresión de resultados
1. Introducción
Este ejemplo muestra cómo analizar un formulario en formato PDF con el Form Parser de Document AI y extraer ciertas entidades (nombre, dirección, teléfono, etc.), tanto en forma de lista en Python como en forma de cajas con coordenadas X, Y para resaltarlas en el propio PDF.
Contenido original en inglés y adaptación de aquí
!pip3 install google-cloud-documentai
!pip3 install wand
!pip3 install pillo
#!apt-get update
#!apt-get install poppler-utils # for converting pdf to jpg. We'll use this for displaying the pdf later
#!apt-get install libmagickwand-dev
Hay que reiniciar el runtime antes de continuar, para que se carguen los paquetes recién instalados.
from google.cloud import documentai_v1beta2 as documentai
from wand.image import Image as WImage
from PIL import Image, ImageDraw
import os
Configuramos el proyecto en Google Cloud:
# When running on Colab, execute the following to authenticate the user.
from google.colab import auth
auth.authenticate_user()
#@title Set Project Id
# Notebook parameters; replace the placeholder values before running the
# cells that follow.
# PROJECT_ID: Google Cloud project used by the gcloud commands and by the
# Document AI request.
PROJECT_ID = 'PROJECT_ID' #@param {type: "string"}
# PDF_URI: Cloud Storage URI of the PDF to analyze (copied with gsutil and
# passed to Document AI as the GCS source).
PDF_URI = "PDF_FILE_STORED_IN_GCS" #@param {type: "string"}
# SERVICE_ACCOUNT_NAME: service account for which a key file is created.
SERVICE_ACCOUNT_NAME="SERVICE_ACCOUNT_NAME" #@param {type: "string"}
Al usar la librería Python de Document AI, necesitamos crear una cuenta de servicio y descargar su clave:
# Make PROJECT_ID the active project for the gcloud commands below.
!gcloud config set project '{PROJECT_ID}'
# Uncomment the following line to create a new service account
#!gcloud iam service-accounts create {SERVICE_ACCOUNT_NAME}
# Create a key for the service account and save it locally as ./key.json.
!gcloud iam service-accounts keys create ./key.json --iam-account {SERVICE_ACCOUNT_NAME}@{PROJECT_ID}.iam.gserviceaccount.com
# Publish the key file path through GOOGLE_APPLICATION_CREDENTIALS.
# NOTE(review): presumably the Document AI client reads its credentials from
# this variable — confirm against the client library documentation.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./key.json"
Habilitar dos APIs: la de Document AI y la de Invoice AI
# Enable the Document AI and Invoice services for the active gcloud project.
!gcloud services enable documentai.googleapis.com
!gcloud services enable invoice.googleapis.com
# Download the file: copy the PDF at PDF_URI to ./doc.pdf
!gsutil cp $PDF_URI ./doc.pdf
def parse_form(project_id=PROJECT_ID,
               input_uri=PDF_URI):
    """Send a PDF stored in Cloud Storage to the Document AI form parser.

    Args:
        project_id: Google Cloud project under which the request is made.
        input_uri: URI of the PDF to analyze, used as the GCS source.

    Returns:
        The document returned by the client's ``process_document`` call.
    """
    # Create the Document AI client first, as every later step feeds it.
    client = documentai.DocumentUnderstandingServiceClient()
    dai_types = documentai.types

    # Input description: the file in GCS to analyze and its MIME type.
    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = dai_types.InputConfig(
        gcs_source=dai_types.GcsSource(uri=input_uri),
        mime_type='application/pdf',
    )

    # Optional key-value pair hints that can improve form parsing.
    # Each hint's key is text likely to appear in the document as a form
    # field name (i.e. "DOB"). Value types are optional, but can be one
    # or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    hints = [
        dai_types.KeyValuePairHint(
            key='Emergency Contact',
            value_types=['NAME'],
        ),
        dai_types.KeyValuePairHint(key='Referred By'),
    ]

    # enabled=True is what switches form extraction on.
    extraction_params = dai_types.FormExtractionParams(
        enabled=True,
        key_value_pair_hints=hints,
    )

    # Location can be 'us' or 'eu'
    request = dai_types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=input_config,
        form_extraction_params=extraction_params,
    )
    return client.process_document(request=request)
# Parse the PDF at the default input URI (PDF_URI) and keep the response in
# `doc` for the cells that follow.
doc = parse_form(PROJECT_ID)
Respuesta del API:
# Number of pages in the document
len(doc.pages)
# Number of form fields detected on the first page
len(doc.pages[0].form_fields)
# First detected form field, in detail
doc.pages[0].form_fields[0]
def get_text(document, el):
    """Return the document text referenced by an element's text anchor.

    Doc AI identifies form fields by their offsets in the document
    text. This function converts those offsets to the text snippet.

    Parameters:
        document: Object exposing the full text as ``document.text``
            (here, the document returned by the Document AI API).
        el: Element exposing ``text_anchor.text_segments``; each segment
            carries ``start_index`` and ``end_index`` offsets into
            ``document.text``.

    Returns:
        str: The referenced segments concatenated in order, or an empty
        string when the element has no text segments.
    """
    # If a text segment spans several lines, it will be stored in
    # different text segments, so the pieces are stitched back together.
    return ''.join(
        document.text[segment.start_index:segment.end_index]
        for segment in el.text_anchor.text_segments
    )
# A form field does not carry its text directly; it points into the
# document text through character offsets, e.g.:
#   text_anchor {
#     text_segments {
#       start_index: 325
#       end_index: 327
#     }
#   }
# The helper function get_text turns those offsets into actual words.
for form_field in doc.pages[0].form_fields:
    # Resolve the name and the value the same way, trimming whitespace.
    field_name, field_value = (
        get_text(doc, part).strip()
        for part in (form_field.field_name, form_field.field_value)
    )
    print(f"{field_name}\t{field_value}")
# Extract the images embedded in doc.pdf as JPEG files whose names start
# with "doc"; the drawing code below opens doc-000.jpg.
# pdfimages is part of poppler-utils.
!pdfimages -j doc.pdf doc
Imprimimos entidades y dibujamos las cajas alrededor:
def _draw_bounding_poly(draw, bounding_poly, image_size, outline):
    """Outline one bounding polygon on the image being drawn.

    Parameters:
        draw: ``ImageDraw.Draw`` object for the target image.
        bounding_poly: Object exposing ``normalized_vertices``, each with
            ``x`` and ``y`` attributes that are scaled by the image size.
        image_size: ``(width, height)`` of the target image in pixels.
        outline: Outline color passed to ``draw.polygon``.
    """
    width, height = image_size
    points = [
        (vertex.x * width, vertex.y * height)
        for vertex in bounding_poly.normalized_vertices
    ]
    # A polygon needs at least three vertices; skip fields that come
    # without a usable box instead of failing on a missing vertex.
    if len(points) < 3:
        return
    draw.polygon(points, outline=outline)


im = Image.open('doc-000.jpg')
draw = ImageDraw.Draw(im)
for form_field in doc.pages[0].form_fields:
    # Draw the bounding boxes around the form field:
    # the field name in red, the field value in blue.
    _draw_bounding_poly(
        draw, form_field.field_name.bounding_poly, im.size, 'red')
    _draw_bounding_poly(
        draw, form_field.field_value.bounding_poly, im.size, 'blue')
# Last expression of the cell: lets the notebook display the annotated image.
im