A solution to detecting a file's Media Type¹ has the following parts:
Please remember to give credit if you copy the code.
StreamMediaType.java
In the following code, `-1` means skip comparing the byte at that index; `-2` denotes the end of a file type signature. This detects binary formats, primarily images, and a few plain text format variations (HTML, SVG, XML). The code uses up to the first 11 "magic" bytes from the data source's header. Optimizations and improvements that shorten the logic are welcome.
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.LinkedHashMap;
import java.util.Map;
import static com.keenwrite.io.MediaType.*;
import static java.lang.System.arraycopy;
/**
 * Detects the {@link MediaType} of a data source by comparing its leading
 * "magic" bytes against a table of known file signatures. At most
 * {@link #FORMAT_LENGTH} bytes are examined.
 */
public class StreamMediaType {
  /** Maximum number of leading bytes examined, and the length of each key. */
  private static final int FORMAT_LENGTH = 11;

  /** Signature value meaning "accept any byte at this index". */
  private static final int ANY_BYTE = -1;

  /** Signature value marking the end of a signature. */
  private static final int END_OF_DATA = -2;

  /**
   * Signatures mapped to media types. Insertion order is the matching order,
   * so longer signatures must precede shorter ones sharing the same prefix
   * (e.g., EPS before PostScript). The array keys are compared by identity,
   * which is sufficient because the map is only ever iterated.
   */
  private static final Map<int[], MediaType> FORMAT = new LinkedHashMap<>();

  static {
    //@formatter:off
    FORMAT.put( ints( 0x3C, 0x73, 0x76, 0x67, 0x20 ), IMAGE_SVG_XML );
    FORMAT.put( ints( 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A ), IMAGE_PNG );
    FORMAT.put( ints( 0xFF, 0xD8, 0xFF, 0xE0 ), IMAGE_JPEG );
    FORMAT.put( ints( 0xFF, 0xD8, 0xFF, 0xEE ), IMAGE_JPEG );
    FORMAT.put( ints( 0xFF, 0xD8, 0xFF, 0xE1, -1, -1, 0x45, 0x78, 0x69, 0x66, 0x00 ), IMAGE_JPEG );
    FORMAT.put( ints( 0x49, 0x49, 0x2A, 0x00 ), IMAGE_TIFF );
    FORMAT.put( ints( 0x4D, 0x4D, 0x00, 0x2A ), IMAGE_TIFF );
    FORMAT.put( ints( 0x47, 0x49, 0x46, 0x38 ), IMAGE_GIF );
    FORMAT.put( ints( 0x8A, 0x4D, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A ), VIDEO_MNG );
    FORMAT.put( ints( 0x25, 0x50, 0x44, 0x46, 0x2D, 0x31, 0x2E ), APP_PDF );
    FORMAT.put( ints( 0x38, 0x42, 0x50, 0x53, 0x00, 0x01 ), IMAGE_PHOTOSHOP );
    FORMAT.put( ints( 0x25, 0x21, 0x50, 0x53, 0x2D, 0x41, 0x64, 0x6F, 0x62, 0x65, 0x2D ), APP_EPS );
    FORMAT.put( ints( 0x25, 0x21, 0x50, 0x53 ), APP_PS );
    FORMAT.put( ints( 0xFF, 0xFB, 0x30 ), AUDIO_MP3 );
    FORMAT.put( ints( 0x49, 0x44, 0x33 ), AUDIO_MP3 );
    FORMAT.put( ints( 0x3C, 0x21 ), TEXT_HTML );
    FORMAT.put( ints( 0x3C, 0x68, 0x74, 0x6D, 0x6C ), TEXT_HTML );
    FORMAT.put( ints( 0x3C, 0x68, 0x65, 0x61, 0x64 ), TEXT_HTML );
    FORMAT.put( ints( 0x3C, 0x62, 0x6F, 0x64, 0x79 ), TEXT_HTML );
    FORMAT.put( ints( 0x3C, 0x48, 0x54, 0x4D, 0x4C ), TEXT_HTML );
    FORMAT.put( ints( 0x3C, 0x48, 0x45, 0x41, 0x44 ), TEXT_HTML );
    FORMAT.put( ints( 0x3C, 0x42, 0x4F, 0x44, 0x59 ), TEXT_HTML );
    FORMAT.put( ints( 0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20 ), TEXT_XML );
    FORMAT.put( ints( 0xFE, 0xFF, 0x00, 0x3C, 0x00, 0x3f, 0x00, 0x78 ), TEXT_XML );
    FORMAT.put( ints( 0xFF, 0xFE, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00 ), TEXT_XML );
    FORMAT.put( ints( 0x42, 0x4D ), IMAGE_BMP );
    FORMAT.put( ints( 0x23, 0x64, 0x65, 0x66 ), IMAGE_X_BITMAP );
    FORMAT.put( ints( 0x21, 0x20, 0x58, 0x50, 0x4D, 0x32 ), IMAGE_X_PIXMAP );
    FORMAT.put( ints( 0x2E, 0x73, 0x6E, 0x64 ), AUDIO_BASIC );
    FORMAT.put( ints( 0x64, 0x6E, 0x73, 0x2E ), AUDIO_BASIC );
    FORMAT.put( ints( 0x52, 0x49, 0x46, 0x46 ), AUDIO_WAV );
    FORMAT.put( ints( 0x50, 0x4B ), APP_ZIP );
    FORMAT.put( ints( 0x41, 0x43, -1, -1, -1, -1, 0x00, 0x00, 0x00, 0x00, 0x00 ), APP_ACAD );
    FORMAT.put( ints( 0xCA, 0xFE, 0xBA, 0xBE ), APP_JAVA );
    FORMAT.put( ints( 0xAC, 0xED ), APP_JAVA_OBJECT );
    //@formatter:on
  }

  /** Utility class; not instantiable. */
  private StreamMediaType() {
  }

  /**
   * Detects the media type of the file at the given path.
   *
   * @param path Location of the file to probe.
   * @return The detected media type, or {@code UNDEFINED} if unrecognized.
   * @throws IOException The file could not be opened or read.
   */
  public static MediaType getMediaType( final Path path ) throws IOException {
    return getMediaType( path.toFile() );
  }

  /**
   * Detects the media type of the given file. The file is opened, probed,
   * and closed before returning.
   *
   * @param file The file to probe.
   * @return The detected media type, or {@code UNDEFINED} if unrecognized.
   * @throws IOException The file could not be opened or read.
   */
  public static MediaType getMediaType( final java.io.File file )
    throws IOException {
    try( final var fis = new FileInputStream( file ) ) {
      return getMediaType( fis );
    }
  }

  /**
   * Detects the media type from the leading bytes of the given stream. Up to
   * {@link #FORMAT_LENGTH} bytes are consumed; the stream is not closed.
   *
   * @param is The stream to probe, positioned at the start of the data.
   * @return The detected media type, or {@code UNDEFINED} if fewer than two
   * bytes could be read or no signature matched.
   * @throws IOException The stream could not be read.
   */
  public static MediaType getMediaType( final InputStream is )
    throws IOException {
    final var input = new byte[ FORMAT_LENGTH ];

    // A single read() may return fewer bytes than are available (e.g., from
    // a socket); readNBytes keeps reading until the buffer is full or the
    // stream ends, and returns 0 (not -1) at end of stream.
    final var count = is.readNBytes( input, 0, FORMAT_LENGTH );

    if( count > 1 ) {
      final var available = new byte[ count ];
      arraycopy( input, 0, available, 0, count );
      return getMediaType( available );
    }

    return UNDEFINED;
  }

  /**
   * Detects the media type from the given leading bytes. Only the first
   * {@link #FORMAT_LENGTH} bytes are examined. A signature that is longer
   * than the given data never matches.
   *
   * @param data Leading bytes of the data source; may be shorter or longer
   *             than {@link #FORMAT_LENGTH}.
   * @return The media type of the first matching signature, or
   * {@code UNDEFINED} if none matched.
   */
  public static MediaType getMediaType( final byte[] data ) {
    assert data != null;

    for( final var entry : FORMAT.entrySet() ) {
      if( matches( entry.getKey(), data ) ) {
        return entry.getValue();
      }
    }

    return UNDEFINED;
  }

  /**
   * Answers whether the data begins with the given signature.
   *
   * @param signature Key created by {@link #ints(int...)}.
   * @param data      Leading bytes of the data source.
   * @return {@code true} if every signature position up to its end marker
   * is a wildcard or equals the unsigned byte at the same index.
   */
  private static boolean matches( final int[] signature, final byte[] data ) {
    for( int i = 0; i < FORMAT_LENGTH && signature[ i ] != END_OF_DATA; i++ ) {
      // The signature demands a byte (or wildcard) that the data lacks.
      if( i >= data.length ) {
        return false;
      }

      // Mask to compare as unsigned: bytes above 0x7F are negative in Java.
      if( signature[ i ] != ANY_BYTE && signature[ i ] != (data[ i ] & 0xFF) ) {
        return false;
      }
    }

    return true;
  }

  /**
   * Creates a fixed-length signature key, padding unused trailing positions
   * with {@link #END_OF_DATA}.
   *
   * @param data Signature values: unsigned bytes, or {@code -1} to accept
   *             any byte at that index.
   * @return An array of exactly {@link #FORMAT_LENGTH} values.
   * @throws IllegalArgumentException More than {@link #FORMAT_LENGTH} values
   *                                  were given.
   */
  private static int[] ints( final int... data ) {
    if( data.length > FORMAT_LENGTH ) {
      throw new IllegalArgumentException(
        "Signature length " + data.length + " exceeds " + FORMAT_LENGTH );
    }

    final var magic = new int[ FORMAT_LENGTH ];
    arraycopy( data, 0, magic, 0, data.length );

    for( int i = data.length; i < FORMAT_LENGTH; i++ ) {
      magic[ i ] = END_OF_DATA;
    }

    return magic;
  }
}
MediaType.java
Define the file formats according to the IANA Media Type list. Notice that the file name extensions are mapped in `MediaTypeExtension`. There's a dependency on Apache's `FilenameUtils` class for its `getExtension` function.
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import static MediaType.TypeName.*;
import static MediaTypeExtension.getMediaType;
import static org.apache.commons.io.FilenameUtils.getExtension;
/**
 * Media types known to the application, each composed of a top-level
 * {@link TypeName} and a subtype (e.g., {@code image/png}).
 */
public enum MediaType {
  APP_ACAD( APPLICATION, "acad" ),
  APP_JAVA_OBJECT( APPLICATION, "x-java-serialized-object" ),
  APP_JAVA( APPLICATION, "java" ),
  APP_PS( APPLICATION, "postscript" ),
  APP_EPS( APPLICATION, "eps" ),
  APP_PDF( APPLICATION, "pdf" ),
  APP_ZIP( APPLICATION, "zip" ),

  // Fonts have their own top-level type; the single-argument constructor
  // would otherwise classify them as images.
  FONT_OTF( TypeName.FONT, "otf" ),
  FONT_TTF( TypeName.FONT, "ttf" ),

  IMAGE_APNG( "apng" ),
  IMAGE_ACES( "aces" ),
  IMAGE_AVCI( "avci" ),
  IMAGE_AVCS( "avcs" ),
  IMAGE_BMP( "bmp" ),
  IMAGE_CGM( "cgm" ),
  IMAGE_DICOM_RLE( "dicom_rle" ),
  IMAGE_EMF( "emf" ),
  IMAGE_EXAMPLE( "example" ),
  IMAGE_FITS( "fits" ),
  IMAGE_G3FAX( "g3fax" ),
  IMAGE_GIF( "gif" ),
  IMAGE_HEIC( "heic" ),
  IMAGE_HEIF( "heif" ),
  IMAGE_HEJ2K( "hej2k" ),
  IMAGE_HSJ2( "hsj2" ),
  IMAGE_X_ICON( "x-icon" ),
  IMAGE_JLS( "jls" ),
  IMAGE_JP2( "jp2" ),
  IMAGE_JPEG( "jpeg" ),
  IMAGE_JPH( "jph" ),
  IMAGE_JPHC( "jphc" ),
  IMAGE_JPM( "jpm" ),
  IMAGE_JPX( "jpx" ),
  IMAGE_JXR( "jxr" ),
  IMAGE_JXRA( "jxrA" ),
  IMAGE_JXRS( "jxrS" ),
  IMAGE_JXS( "jxs" ),
  IMAGE_JXSC( "jxsc" ),
  IMAGE_JXSI( "jxsi" ),
  IMAGE_JXSS( "jxss" ),
  IMAGE_KTX( "ktx" ),
  IMAGE_KTX2( "ktx2" ),
  IMAGE_NAPLPS( "naplps" ),
  IMAGE_PNG( "png" ),
  IMAGE_PHOTOSHOP( "photoshop" ),
  IMAGE_SVG_XML( "svg+xml" ),
  IMAGE_T38( "t38" ),
  IMAGE_TIFF( "tiff" ),
  IMAGE_WEBP( "webp" ),
  IMAGE_WMF( "wmf" ),
  IMAGE_X_BITMAP( "x-xbitmap" ),
  IMAGE_X_PIXMAP( "x-xpixmap" ),

  AUDIO_BASIC( AUDIO, "basic" ),
  AUDIO_MP3( AUDIO, "mp3" ),
  AUDIO_WAV( AUDIO, "x-wav" ),

  VIDEO_MNG( VIDEO, "x-mng" ),

  TEXT_HTML( TEXT, "html" ),
  TEXT_MARKDOWN( TEXT, "markdown" ),
  TEXT_PLAIN( TEXT, "plain" ),
  // Referenced by MediaTypeExtension (Rmd, Rxml).
  // NOTE(review): subtype names are not IANA-registered -- confirm spelling.
  TEXT_R_MARKDOWN( TEXT, "R+markdown" ),
  TEXT_R_XML( TEXT, "R+xml" ),
  TEXT_XHTML( TEXT, "xhtml+xml" ),
  TEXT_XML( TEXT, "xml" ),
  TEXT_YAML( TEXT, "yaml" ),

  /*
   * When all other lights go out.
   */
  UNDEFINED( TypeName.UNDEFINED, "undefined" );

  /**
   * Top-level media type names. The lowercase constant name is the text that
   * precedes the slash in the media type's string form.
   */
  public enum TypeName {
    APPLICATION,
    AUDIO,
    FONT,
    IMAGE,
    TEXT,
    UNDEFINED,
    VIDEO
  }

  /** Complete "type/subtype" string, computed once at construction. */
  private final String mMediaType;
  private final TypeName mTypeName;
  private final String mSubtype;

  /**
   * Constructs an {@code image} media type.
   *
   * @param subtype The text that follows the slash.
   */
  MediaType( final String subtype ) {
    this( IMAGE, subtype );
  }

  /**
   * Constructs a media type of the given top-level type and subtype.
   *
   * @param typeName The top-level type.
   * @param subtype  The text that follows the slash.
   */
  MediaType( final TypeName typeName, final String subtype ) {
    mTypeName = typeName;
    mSubtype = subtype;

    // Locale.ROOT keeps the result ASCII regardless of the default locale;
    // a Turkish locale would otherwise lowercase 'I' to a dotless 'i'.
    mMediaType =
      typeName.toString().toLowerCase( java.util.Locale.ROOT ) + '/' + subtype;
  }

  /**
   * Looks up the media type from the given file's name extension.
   *
   * @param file The file whose name is to be examined.
   * @return The media type mapped to the file name extension.
   */
  public static MediaType valueFrom( final File file ) {
    assert file != null;
    return fromFilename( file.getName() );
  }

  /**
   * Looks up the media type from the given file name's extension.
   *
   * @param filename The file name to examine.
   * @return The media type mapped to the file name extension.
   */
  public static MediaType fromFilename( final String filename ) {
    assert filename != null;
    return getMediaType( getExtension( filename ) );
  }

  /**
   * Looks up the media type from the given path's file name extension.
   *
   * @param path The path whose file name is to be examined.
   * @return The media type mapped to the file name extension.
   */
  public static MediaType valueFrom( final Path path ) {
    assert path != null;
    return valueFrom( path.toFile() );
  }

  /**
   * Parses a content type string such as {@code text/html; charset=UTF-8}.
   * Anything from the first semicolon onward is ignored, as is whitespace
   * surrounding the type and subtype.
   *
   * @param contentType The content type to parse; may be {@code null}.
   * @return The matching media type, or {@link #UNDEFINED} if the value is
   * {@code null}, blank, lacks a slash, or is not a known media type.
   */
  public static MediaType valueFrom( String contentType ) {
    if( contentType == null || contentType.isBlank() ) {
      return UNDEFINED;
    }

    // Discard parameters (e.g., "; charset=UTF-8").
    var i = contentType.indexOf( ';' );
    contentType = contentType.substring(
      0, i == -1 ? contentType.length() : i );

    i = contentType.indexOf( '/' );

    // Without a separator there is no subtype to extract.
    if( i == -1 ) {
      return UNDEFINED;
    }

    final var type = contentType.substring( 0, i ).strip();
    final var subtype = contentType.substring( i + 1 ).strip();

    return valueFrom( type, subtype );
  }

  /**
   * Finds the media type having the given type and subtype, ignoring case.
   *
   * @param type    The top-level type name (e.g., {@code image}).
   * @param subtype The subtype (e.g., {@code png}).
   * @return The matching media type, or {@link #UNDEFINED} if none matches.
   */
  public static MediaType valueFrom(
    final String type, final String subtype ) {
    assert type != null;
    assert subtype != null;

    for( final var mediaType : values() ) {
      if( mediaType.equals( type, subtype ) ) {
        return mediaType;
      }
    }

    return UNDEFINED;
  }

  /**
   * Answers whether this media type has the given type and subtype,
   * ignoring case.
   *
   * @param type    The top-level type name to compare.
   * @param subtype The subtype to compare.
   * @return {@code true} if both parts match.
   */
  public boolean equals( final String type, final String subtype ) {
    assert type != null;
    assert subtype != null;

    return mTypeName.name().equalsIgnoreCase( type ) &&
      mSubtype.equalsIgnoreCase( subtype );
  }

  /**
   * Answers whether this media type belongs to the given top-level type.
   *
   * @param typeName The top-level type to compare.
   * @return {@code true} if the top-level types are the same.
   */
  public boolean isType( final TypeName typeName ) {
    return mTypeName == typeName;
  }

  /**
   * Returns the text that follows the slash.
   *
   * @return The subtype (e.g., {@code png}).
   */
  public String getSubtype() {
    return mSubtype;
  }

  /**
   * Returns the media type in "type/subtype" form.
   *
   * @return The full media type string (e.g., {@code image/png}).
   */
  @Override
  public String toString() {
    return mMediaType;
  }
}
MediaTypeExtension.java
The last piece of the puzzle is a map of `MediaType`s to their known and common/popular file name extensions. This allows bidirectional lookup based on file name extensions.
import static MediaType.*;
import static java.util.List.of;
/**
 * Associates each {@link MediaType} with its known file name extensions,
 * allowing lookups in both directions. The first extension listed for a
 * media type is the one returned by {@link #getExtension()}.
 */
public enum MediaTypeExtension {
  MEDIA_APP_ACAD( APP_ACAD, of( "dwg" ) ),
  MEDIA_APP_PDF( APP_PDF ),
  MEDIA_APP_PS( APP_PS, of( "ps" ) ),
  MEDIA_APP_EPS( APP_EPS ),
  MEDIA_APP_ZIP( APP_ZIP ),

  MEDIA_AUDIO_MP3( AUDIO_MP3 ),
  MEDIA_AUDIO_BASIC( AUDIO_BASIC, of( "au" ) ),
  MEDIA_AUDIO_WAV( AUDIO_WAV, of( "wav" ) ),

  MEDIA_FONT_OTF( FONT_OTF ),
  MEDIA_FONT_TTF( FONT_TTF ),

  MEDIA_IMAGE_APNG( IMAGE_APNG ),
  MEDIA_IMAGE_BMP( IMAGE_BMP ),
  MEDIA_IMAGE_GIF( IMAGE_GIF ),
  MEDIA_IMAGE_JPEG( IMAGE_JPEG,
                    of( "jpg", "jpe", "jpeg", "jfif", "pjpeg", "pjp" ) ),
  MEDIA_IMAGE_PNG( IMAGE_PNG ),
  MEDIA_IMAGE_PSD( IMAGE_PHOTOSHOP, of( "psd" ) ),
  MEDIA_IMAGE_SVG( IMAGE_SVG_XML, of( "svg" ) ),
  MEDIA_IMAGE_TIFF( IMAGE_TIFF, of( "tiff", "tif" ) ),
  MEDIA_IMAGE_WEBP( IMAGE_WEBP ),
  MEDIA_IMAGE_X_BITMAP( IMAGE_X_BITMAP, of( "xbm" ) ),
  MEDIA_IMAGE_X_PIXMAP( IMAGE_X_PIXMAP, of( "xpm" ) ),

  MEDIA_VIDEO_MNG( VIDEO_MNG, of( "mng" ) ),

  MEDIA_TEXT_MARKDOWN( TEXT_MARKDOWN, of(
    "md", "markdown", "mdown", "mdtxt", "mdtext", "mdwn", "mkd", "mkdown",
    "mkdn" ) ),
  MEDIA_TEXT_PLAIN( TEXT_PLAIN, of( "txt", "asc", "ascii", "text", "utxt" ) ),
  MEDIA_TEXT_R_MARKDOWN( TEXT_R_MARKDOWN, of( "Rmd" ) ),
  MEDIA_TEXT_R_XML( TEXT_R_XML, of( "Rxml" ) ),
  MEDIA_TEXT_XHTML( TEXT_XHTML, of( "xhtml" ) ),
  MEDIA_TEXT_XML( TEXT_XML ),
  MEDIA_TEXT_YAML( TEXT_YAML, of( "yaml", "yml" ) ),

  MEDIA_UNDEFINED( UNDEFINED, of( "undefined" ) );

  private final MediaType mMediaType;

  // Fully qualified because only the static List.of factory is imported.
  private final java.util.List<String> mExtensions;

  /**
   * Maps a media type to a single extension that is the same text as the
   * media type's subtype.
   *
   * @param mediaType The media type to map.
   */
  MediaTypeExtension( final MediaType mediaType ) {
    this( mediaType, of( mediaType.getSubtype() ) );
  }

  /**
   * Maps a media type to one or more file name extensions.
   *
   * @param mediaType  The media type to map.
   * @param extensions Non-empty list of extensions, without a leading
   *                   period; the first is the preferred extension.
   */
  MediaTypeExtension(
    final MediaType mediaType, final java.util.List<String> extensions ) {
    assert mediaType != null;
    assert extensions != null;
    assert !extensions.isEmpty();

    mMediaType = mediaType;
    mExtensions = extensions;
  }

  /**
   * Returns the preferred file name extension for this mapping.
   *
   * @return The first extension in the list, without a leading period.
   */
  public String getExtension() {
    return mExtensions.get( 0 );
  }

  /**
   * Finds the mapping for the given media type.
   *
   * @param mediaType The media type to look up.
   * @return The first mapping declared for the media type, or
   * {@link #MEDIA_UNDEFINED} if the media type has no mapping.
   */
  public static MediaTypeExtension valueFrom( final MediaType mediaType ) {
    for( final var type : values() ) {
      if( type.isMediaType( mediaType ) ) {
        return type;
      }
    }

    return MEDIA_UNDEFINED;
  }

  /**
   * Answers whether this mapping is for the given media type.
   *
   * @param mediaType The media type to compare.
   * @return {@code true} if the media types are the same constant.
   */
  boolean isMediaType( final MediaType mediaType ) {
    return mMediaType == mediaType;
  }

  /**
   * Finds the media type for the given file name extension, ignoring case.
   *
   * @param extension The extension without a leading period; may be
   *                  {@code null}.
   * @return The media type of the first mapping that lists the extension,
   * or {@code UNDEFINED} if no mapping lists it.
   */
  static MediaType getMediaType( final String extension ) {
    final var sanitized = sanitize( extension );

    for( final var mediaType : MediaTypeExtension.values() ) {
      if( mediaType.isType( sanitized ) ) {
        return mediaType.getMediaType();
      }
    }

    return UNDEFINED;
  }

  /**
   * Answers whether this mapping lists the given extension, ignoring case.
   *
   * @param sanitized A non-null extension.
   * @return {@code true} if any listed extension matches.
   */
  private boolean isType( final String sanitized ) {
    for( final var extension : mExtensions ) {
      if( extension.equalsIgnoreCase( sanitized ) ) {
        return true;
      }
    }

    return false;
  }

  /**
   * Normalizes an extension for comparison.
   *
   * @param extension The extension to normalize; may be {@code null}.
   * @return The lowercase extension, or the empty string for {@code null}.
   */
  private static String sanitize( final String extension ) {
    // Locale.ROOT avoids locale-specific case mappings for ASCII extensions.
    return extension == null
      ? ""
      : extension.toLowerCase( java.util.Locale.ROOT );
  }

  /**
   * Returns the media type for this mapping.
   *
   * @return The mapped media type.
   */
  private MediaType getMediaType() {
    return mMediaType;
  }
}
Usages:
// EXAMPLE -- Detect media type
//
final File image = new File( "filename.jpg" );
final MediaType mt = StreamMediaType.getMediaType( image );
// Tricky! The JPG could be a PNG in disguise.
if( mt.isType( MediaType.TypeName.IMAGE ) ) {
if( mt == MediaType.IMAGE_PNG ) {
// Nice try! Sneaky sneak.
}
}
// EXAMPLE -- Get typical media type file name extension
//
final String ext = MediaTypeExtension.valueFrom( MediaType.IMAGE_SVG_XML ).getExtension();
// EXAMPLE -- Get media type from HTTP request
//
final var url = new URL( "https://localhost/path/file.ext" );
final var conn = (HttpURLConnection) url.openConnection();
final var contentType = conn.getContentType();
MediaType mediaType = valueFrom( contentType );
// Fall back to stream detection probe
if( mediaType == UNDEFINED ) {
mediaType = StreamMediaType.getMediaType( conn.getInputStream() );
}
conn.disconnect();
You get the idea.
Short library review:
Sample audio, video, and image files for testing:
Note that nearly all XML documents will begin the same way:
<?xml version="1.0" standalone="no"?>
Since SVG documents are XML documents, many SVG documents will contain that XML declaration and may also contain:
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">
Detecting the SVG doctype would be possible by bumping the magic bytes from 11 to 13. Still, the doctype is not required, meaning that the SVG document could also begin after the XML declaration as follows:
<svg xmlns="http://www.w3.org/2000/svg">
Meaning, use caution when using this code to detect SVG file formats, as it is not reliable. Instead, consider using the HTTP Content-Type or filename extension.
Compounding the issue is that comments of arbitrary length can be inserted before the `<svg` tag, making detection extra-difficult.
¹ "MIME type" is a deprecated term.