The avro specification allows using different write and read schema provided they match. The specification further allows aliases to cater for differences between the read and write schema. The following python 2.7 tries to illustrate this.
import uuid
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
write_schema = {
"namespace": "example.avro",
"type": "record",
"name": "User",
"fields": [
{"name": "name", "type": "string"},
{"name": "favorite_number", "type": ["int", "null"]},
{"name": "favorite_color", "type": ["string", "null"]}
]
}
writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(write_schema))
writer.append({"name": "Alyssa", "favorite_number": 256})
writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
writer.close()
read_schema = {
"namespace": "example.avro",
"type": "record",
"name": "User",
"fields": [
{"name": "first_name", "type": "string", "aliases": ["name"]},
{"name": "favorite_number", "type": ["int", "null"]},
{"name": "favorite_color", "type": ["string", "null"]}
]
}
# 1. open avro and extract passport + data
reader = DataFileReader(open("users.avro", "rb"), DatumReader(write_schema, read_schema))
reader.close()
This code has the following error message:
/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 /Users/simonshapiro/python_beam/src/avrov_test.py
Traceback (most recent call last):
File "/Users/simonshapiro/python_beam/src/avrov_test.py", line 67, in <module>
writer.append({"name": "Alyssa", "favorite_number": 256})
File "/Library/Python/2.7/site-packages/avro/datafile.py", line 196, in append
self.datum_writer.write(datum, self.buffer_encoder)
File "/Library/Python/2.7/site-packages/avro/io.py", line 768, in write
if not validate(self.writers_schema, datum):
File "/Library/Python/2.7/site-packages/avro/io.py", line 103, in validate
schema_type = expected_schema.type
AttributeError: 'dict' object has no attribute 'type'
Process finished with exit code 1
When it is run without different schema using this line
reader = DataFileReader(open("users.avro", "rb"), DatumReader())
it works fine.