Read and write schema when using the python avro library
Asked Answered
S

1

8

The Avro specification allows the writer and reader schemas to differ, provided they match under its schema-resolution rules. The specification further allows aliases to cater for naming differences between the reader and writer schemas. The following Python 2.7 code tries to illustrate this.

import uuid
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter


# Writer schema for the "User" record.
# NOTE: this is a plain Python dict, not a parsed avro schema object; the
# traceback quoted below fails on exactly this
# ("'dict' object has no attribute 'type'").
write_schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
         {"name": "name", "type": "string"},
         {"name": "favorite_number", "type": ["int", "null"]},
         {"name": "favorite_color", "type": ["string", "null"]}
     ]
}
# The dict is handed to DatumWriter as its schema; the first append() below is
# the call that raises in the quoted traceback.
writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(write_schema))
writer.append({"name": "Alyssa", "favorite_number": 256})
writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
writer.close()

# Reader schema: same record, but the "name" field is exposed as "first_name",
# with "name" declared as an alias.
# NOTE: also a plain dict rather than a parsed schema object.
read_schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "first_name", "type": "string", "aliases": ["name"]},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}

# 1. open avro and extract passport + data
# The reader is given both the writer schema and the reader schema; it is
# opened and closed here without reading any records.
reader = DataFileReader(open("users.avro", "rb"), DatumReader(write_schema, read_schema))
reader.close()

This code fails with the following error:

/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 /Users/simonshapiro/python_beam/src/avrov_test.py
Traceback (most recent call last):
  File "/Users/simonshapiro/python_beam/src/avrov_test.py", line 67, in <module>
    writer.append({"name": "Alyssa", "favorite_number": 256})
  File "/Library/Python/2.7/site-packages/avro/datafile.py", line 196, in append
    self.datum_writer.write(datum, self.buffer_encoder)
  File "/Library/Python/2.7/site-packages/avro/io.py", line 768, in write
    if not validate(self.writers_schema, datum):
  File "/Library/Python/2.7/site-packages/avro/io.py", line 103, in validate
    schema_type = expected_schema.type
AttributeError: 'dict' object has no attribute 'type'

Process finished with exit code 1

When the file is read without supplying separate writer and reader schemas, using this line instead,

reader = DataFileReader(open("users.avro", "rb"), DatumReader())

it works fine.

Slusher answered 11/6, 2017 at 19:7 Comment(0)
S
9

Well, after some more work I discovered that the schemas were not set up correctly: they have to be parsed into avro schema objects rather than passed as plain dicts. This code works as intended:

import uuid
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter


# Schemas must be avro schema objects rather than plain dicts, so each dict is
# serialised to JSON and handed to avro.schema.parse().
write_schema = avro.schema.parse(json.dumps({
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}))

# Write two records. The writer schema goes to DataFileWriter (third
# argument); DatumWriter is created without one.
# The `with` block guarantees the file handle is released even if append()
# raises; on success writer.close() flushes the data and closes the file.
with open("users.avro", "wb") as avro_out:
    writer = DataFileWriter(avro_out, DatumWriter(), write_schema)
    writer.append({"name": "Alyssa", "favorite_number": 256})
    writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
    writer.close()

# Reader schema: the writer's "name" field is read as "first_name" by
# declaring "name" as an alias of the renamed field.
read_schema = avro.schema.parse(json.dumps({
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "first_name", "type": "string", "default": "", "aliases": ["name"]},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}))

# Read the file back, resolving the writer schema against the reader schema.
with open("users.avro", "rb") as avro_in:
    reader = DataFileReader(avro_in, DatumReader(write_schema, read_schema))
    # Value stored under the "avro.schema" key of the file's metadata
    # (per the Avro container-file format this is the writer schema as JSON).
    new_schema = reader.get_meta("avro.schema")
    # Materialise every record decoded through the reader schema.
    users = list(reader)
    reader.close()
Slusher answered 11/6, 2017 at 20:48 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.