Various minor changes related to file mime type detection.

- Improve or just remove some file magic signatures ported from libmagic
  that were too general and matched incorrectly too often.

- Fix MHR script's use of fa_file$mime_type before checking if it's
  initialized.  It may be uninitialized if no signatures match.

- The "fa_file" record now contains a "mime_types" field that contains
  all magic signatures that matched the file content (where the
  "mime_type" field is just a shortcut for the strongest match).
This commit is contained in:
Jon Siwek 2014-03-06 11:41:10 -06:00
parent 0865b152bb
commit 095a68b2ec
15 changed files with 187 additions and 143 deletions

View file

@ -6,6 +6,12 @@ signature file-plaintext {
} }
signature file-binary { signature file-binary {
file-magic /(.*)([^[:print:][:space:]]+)/ # Exclude bytes that can be ASCII or some ISO-8859 characters.
file-magic /(.*)([^[:print:][:space:]\xa0-\xff]+)/
file-mime "binary", -10 file-mime "binary", -10
} }
# Matches a 100-byte run of printable-or-NUL bytes followed by three
# consecutive 8-byte fields made up only of digits, NULs, or spaces.
# NOTE(review): this presumably corresponds to the name, mode, uid and gid
# fields at the start of a tar header -- confirm against the tar format.
signature file-tar {
file-magic /([[:print:]\x00]){100}(([[:digit:]\x00\x20]){8}){3}/
file-mime "application/x-tar", 150
}

View file

@ -569,7 +569,7 @@ signature file-magic-auto87 {
# >>>>&0 search/1024,=\n (len=1), [""], swap_endian=0 # >>>>&0 search/1024,=\n (len=1), [""], swap_endian=0
# >>>>>&0 search/1,=@@ (len=2), ["unified diff output text"], swap_endian=0 # >>>>>&0 search/1,=@@ (len=2), ["unified diff output text"], swap_endian=0
signature file-magic-auto88 { signature file-magic-auto88 {
file-mime "text/x-diff", 40 file-mime "text/x-diff", 55
file-magic /(.*)(\x2d\x2d\x2d )(.*)(\x0a)(.*)(\x2b\x2b\x2b )(.*)(\x0a)(.*)(\x40\x40)/ file-magic /(.*)(\x2d\x2d\x2d )(.*)(\x0a)(.*)(\x2b\x2b\x2b )(.*)(\x0a)(.*)(\x40\x40)/
} }
@ -2643,7 +2643,7 @@ signature file-magic-auto388 {
# >>&0 regex,= {0,50}\(([a-zA-Z]|,| ){1,500}\):$ (len=34), ["Python script text executable"], swap_endian=0 # >>&0 regex,= {0,50}\(([a-zA-Z]|,| ){1,500}\):$ (len=34), ["Python script text executable"], swap_endian=0
signature file-magic-auto389 { signature file-magic-auto389 {
file-mime "text/x-python", 64 file-mime "text/x-python", 64
file-magic /(^( |\t){0,50}def {1,50}[a-zA-Z]{1,100})( {0,50}\(([a-zA-Z]|,| ){1,500}\):$)/ file-magic /(.*)(( |\t){0,50}def {1,50}[a-zA-Z]{1,100})( {0,50}\(([a-zA-Z]|,| ){1,500}\):$)/
} }
# >0 search/4096,=\documentstyle (len=14), ["LaTeX document text"], swap_endian=0 # >0 search/4096,=\documentstyle (len=14), ["LaTeX document text"], swap_endian=0
@ -2704,7 +2704,7 @@ signature file-magic-auto397 {
# >>>0 regex,=^[ \t]*end([ \t]*[;#].*)?$ (len=24), ["Ruby script text"], swap_endian=0 # >>>0 regex,=^[ \t]*end([ \t]*[;#].*)?$ (len=24), ["Ruby script text"], swap_endian=0
signature file-magic-auto398 { signature file-magic-auto398 {
file-mime "text/x-ruby", 54 file-mime "text/x-ruby", 54
file-magic /(^[ \x09]*require[ \x09]'[A-Za-z_\x2f]+')(include [A-Z]|def [a-z]| do$)(^[ \x09]*end([ \x09]*[;#].*)?$)/ file-magic /(.*)([ \x09]*require[ \x09]'[A-Za-z_\x2f]+')(include [A-Z]|def [a-z]| do$)(^[ \x09]*end([ \x09]*[;#].*)?$)/
} }
# >0 search/1,=eval "exec /usr/local/bin/perl (len=30), ["Perl script text"], swap_endian=0 # >0 search/1,=eval "exec /usr/local/bin/perl (len=30), ["Perl script text"], swap_endian=0
@ -2760,7 +2760,7 @@ signature file-magic-auto406 {
# >>>0 regex,=^[ \t]*end([ \t]*[;#].*)?$ (len=24), ["Ruby module source text"], swap_endian=0 # >>>0 regex,=^[ \t]*end([ \t]*[;#].*)?$ (len=24), ["Ruby module source text"], swap_endian=0
signature file-magic-auto407 { signature file-magic-auto407 {
file-mime "text/x-ruby", 54 file-mime "text/x-ruby", 54
file-magic /(^[ \x09]*(class|module)[ \x09][A-Z])((modul|includ)e [A-Z]|def [a-z])(^[ \x09]*end([ \x09]*[;#].*)?$)/ file-magic /(.*)([ \x09]*(class|module)[ \x09][A-Z])((modul|includ)e [A-Z]|def [a-z])(^[ \x09]*end([ \x09]*[;#].*)?$)/
} }
# >512 string/b,=\354\245\301 (len=3), ["Microsoft Word Document"], swap_endian=0 # >512 string/b,=\354\245\301 (len=3), ["Microsoft Word Document"], swap_endian=0
@ -2797,7 +2797,7 @@ signature file-magic-auto412 {
# >0 regex,=^from\s+(\w|\.)+\s+import.*$ (len=28), ["Python script text executable"], swap_endian=0 # >0 regex,=^from\s+(\w|\.)+\s+import.*$ (len=28), ["Python script text executable"], swap_endian=0
signature file-magic-auto413 { signature file-magic-auto413 {
file-mime "text/x-python", 58 file-mime "text/x-python", 58
file-magic /(^from\s+(\w|\.)+\s+import.*$)/ file-magic /(.*)(from\s+(\w|\.)+\s+import.*$)/
} }
# >0 search/4096,=\contentsline (len=13), ["LaTeX table of contents"], swap_endian=0 # >0 search/4096,=\contentsline (len=13), ["LaTeX table of contents"], swap_endian=0
@ -3342,11 +3342,12 @@ signature file-magic-auto497 {
file-magic /(.{4})(jP)/ file-magic /(.{4})(jP)/
} }
# Not specific enough.
# >0 regex,=^template[ \t\n]+ (len=15), ["C++ source text"], swap_endian=0 # >0 regex,=^template[ \t\n]+ (len=15), ["C++ source text"], swap_endian=0
signature file-magic-auto498 { #signature file-magic-auto498 {
file-mime "text/x-c++", 50 # file-mime "text/x-c++", 50
file-magic /(^template[ \x09\x0a]+)/ # file-magic /(.*)(template[ \x09\x0a]+)/
} #}
# >0 search/c/1,=<?php (len=5), ["PHP script text"], swap_endian=0 # >0 search/c/1,=<?php (len=5), ["PHP script text"], swap_endian=0
signature file-magic-auto499 { signature file-magic-auto499 {
@ -3417,9 +3418,46 @@ signature file-magic-auto509 {
# >0 regex,=^[ \t]{0,50}\.asciiz (len=19), ["assembler source text"], swap_endian=0 # >0 regex,=^[ \t]{0,50}\.asciiz (len=19), ["assembler source text"], swap_endian=0
signature file-magic-auto510 { signature file-magic-auto510 {
file-mime "text/x-asm", 49 file-mime "text/x-asm", 49
file-magic /(^[ \x09]{0,50}\.asciiz)/ file-magic /(^[ \x09]{0,50}\.(asciiz|asciz|section|globl|align|even|byte|file|type))/
} }
# >0 regex,=^[ \t]{0,50}\.globl (len=18), ["assembler source text"], swap_endian=0
#signature file-magic-auto517 {
# file-mime "text/x-asm", 48
# file-magic /(^[ \x09]{0,50}\.globl)/
#}
# >0 regex,=^[ \t]{0,50}\.text (len=17), ["assembler source text"], swap_endian=0
#signature file-magic-auto523 {
# file-mime "text/x-asm", 47
# file-magic /(^[ \x09]{0,50}\.text)/
#}
# >0 regex,=^[ \t]{0,50}\.even (len=17), ["assembler source text"], swap_endian=0
#signature file-magic-auto524 {
# file-mime "text/x-asm", 47
# file-magic /(^[ \x09]{0,50}\.even)/
#}
# >0 regex,=^[ \t]{0,50}\.byte (len=17), ["assembler source text"], swap_endian=0
#signature file-magic-auto525 {
# file-mime "text/x-asm", 47
# file-magic /(^[ \x09]{0,50}\.byte)/
#}
# >0 regex,=^[ \t]{0,50}\.file (len=17), ["assembler source text"], swap_endian=0
#signature file-magic-auto526 {
# file-mime "text/x-asm", 47
# file-magic /(^[ \x09]{0,50}\.file)/
#}
# >0 regex,=^[ \t]{0,50}\.type (len=17), ["assembler source text"], swap_endian=0
#signature file-magic-auto527 {
# file-mime "text/x-asm", 47
# file-magic /(^[ \x09]{0,50}\.type)/
#}
# >0 search/1,=#!/usr/bin/env perl (len=19), ["Perl script text executable"], swap_endian=0 # >0 search/1,=#!/usr/bin/env perl (len=19), ["Perl script text executable"], swap_endian=0
signature file-magic-auto511 { signature file-magic-auto511 {
file-mime "text/x-perl", 49 file-mime "text/x-perl", 49
@ -3432,11 +3470,12 @@ signature file-magic-auto512 {
file-magic /(.*)(\x3c\x21[dD][oO][cC][tT][yY][pP][eE] {1,}[hH][tT][mM][lL])/ file-magic /(.*)(\x3c\x21[dD][oO][cC][tT][yY][pP][eE] {1,}[hH][tT][mM][lL])/
} }
# This doesn't seem specific enough.
# >0 regex,=^virtual[ \t\n]+ (len=14), ["C++ source text"], swap_endian=0 # >0 regex,=^virtual[ \t\n]+ (len=14), ["C++ source text"], swap_endian=0
signature file-magic-auto513 { #signature file-magic-auto513 {
file-mime "text/x-c++", 49 # file-mime "text/x-c++", 49
file-magic /(^virtual[ \x09\x0a]+)/ # file-magic /(.*)(virtual[ \x09\x0a]+)/
} #}
# >0 search/1,=#! /usr/bin/env lua (len=19), ["Lua script text executable"], swap_endian=0 # >0 search/1,=#! /usr/bin/env lua (len=19), ["Lua script text executable"], swap_endian=0
signature file-magic-auto514 { signature file-magic-auto514 {
@ -3455,13 +3494,6 @@ signature file-magic-auto516 {
file-mime "text/x-tcl", 49 file-mime "text/x-tcl", 49
file-magic /(.*)(\x23\x21 \x2fusr\x2fbin\x2fenv tcl)/ file-magic /(.*)(\x23\x21 \x2fusr\x2fbin\x2fenv tcl)/
} }
# >0 regex,=^[ \t]{0,50}\.globl (len=18), ["assembler source text"], swap_endian=0
signature file-magic-auto517 {
file-mime "text/x-asm", 48
file-magic /(^[ \x09]{0,50}\.globl)/
}
# >0 search/1,=#!/usr/bin/env tcl (len=18), ["Tcl script text executable"], swap_endian=0 # >0 search/1,=#!/usr/bin/env tcl (len=18), ["Tcl script text executable"], swap_endian=0
signature file-magic-auto518 { signature file-magic-auto518 {
file-mime "text/x-tcl", 48 file-mime "text/x-tcl", 48
@ -3489,37 +3521,7 @@ signature file-magic-auto521 {
# >0 regex,=^class[ \t\n]+ (len=12), ["C++ source text"], swap_endian=0 # >0 regex,=^class[ \t\n]+ (len=12), ["C++ source text"], swap_endian=0
signature file-magic-auto522 { signature file-magic-auto522 {
file-mime "text/x-c++", 47 file-mime "text/x-c++", 47
file-magic /(^class[ \x09\x0a]+)/ file-magic /(.*)(class[ \x09\x0a]+[[:alnum:]_]+)(.*)(\x7b)(.*)(public:)/
}
# >0 regex,=^[ \t]{0,50}\.text (len=17), ["assembler source text"], swap_endian=0
signature file-magic-auto523 {
file-mime "text/x-asm", 47
file-magic /(^[ \x09]{0,50}\.text)/
}
# >0 regex,=^[ \t]{0,50}\.even (len=17), ["assembler source text"], swap_endian=0
signature file-magic-auto524 {
file-mime "text/x-asm", 47
file-magic /(^[ \x09]{0,50}\.even)/
}
# >0 regex,=^[ \t]{0,50}\.byte (len=17), ["assembler source text"], swap_endian=0
signature file-magic-auto525 {
file-mime "text/x-asm", 47
file-magic /(^[ \x09]{0,50}\.byte)/
}
# >0 regex,=^[ \t]{0,50}\.file (len=17), ["assembler source text"], swap_endian=0
signature file-magic-auto526 {
file-mime "text/x-asm", 47
file-magic /(^[ \x09]{0,50}\.file)/
}
# >0 regex,=^[ \t]{0,50}\.type (len=17), ["assembler source text"], swap_endian=0
signature file-magic-auto527 {
file-mime "text/x-asm", 47
file-magic /(^[ \x09]{0,50}\.type)/
} }
# >0 search/1,=This is Info file (len=17), ["GNU Info text"], swap_endian=0 # >0 search/1,=This is Info file (len=17), ["GNU Info text"], swap_endian=0
@ -3717,11 +3719,12 @@ signature file-magic-auto553 {
file-magic /(.*)(\x5cinput texinfo)/ file-magic /(.*)(\x5cinput texinfo)/
} }
# Not specific enough.
# >0 regex,=^private: (len=9), ["C++ source text"], swap_endian=0 # >0 regex,=^private: (len=9), ["C++ source text"], swap_endian=0
signature file-magic-auto554 { #signature file-magic-auto554 {
file-mime "text/x-c++", 44 # file-mime "text/x-c++", 44
file-magic /(^private:)/ # file-magic /(.*)(private:)/
} #}
# >0 search/4096,=def __init__ (len=12), [""], swap_endian=0 # >0 search/4096,=def __init__ (len=12), [""], swap_endian=0
# >>&0 search/64,=self (len=4), ["Python script text executable"], swap_endian=0 # >>&0 search/64,=self (len=4), ["Python script text executable"], swap_endian=0
@ -3739,7 +3742,7 @@ signature file-magic-auto556 {
# >0 regex,=^extern[ \t\n]+ (len=13), ["C source text"], swap_endian=0 # >0 regex,=^extern[ \t\n]+ (len=13), ["C source text"], swap_endian=0
signature file-magic-auto557 { signature file-magic-auto557 {
file-mime "text/x-c", 43 file-mime "text/x-c", 43
file-magic /(^extern[ \x09\x0a]+)/ file-magic /(.*)(extern[ \x09\x0a]+)/
} }
# >0 search/4096,=% -*-latex-*- (len=13), ["LaTeX document text"], swap_endian=0 # >0 search/4096,=% -*-latex-*- (len=13), ["LaTeX document text"], swap_endian=0
@ -3748,16 +3751,17 @@ signature file-magic-auto558 {
file-magic /(.*)(\x25 \x2d\x2a\x2dlatex\x2d\x2a\x2d)/ file-magic /(.*)(\x25 \x2d\x2a\x2dlatex\x2d\x2a\x2d)/
} }
# Doesn't seem specific enough.
# >0 regex,=^double[ \t\n]+ (len=13), ["C source text"], swap_endian=0 # >0 regex,=^double[ \t\n]+ (len=13), ["C source text"], swap_endian=0
signature file-magic-auto559 { #signature file-magic-auto559 {
file-mime "text/x-c", 43 # file-mime "text/x-c", 43
file-magic /(^double[ \x09\x0a]+)/ # file-magic /(^double[ \x09\x0a]+)/
} #}
# >0 regex,=^struct[ \t\n]+ (len=13), ["C source text"], swap_endian=0 # >0 regex,=^struct[ \t\n]+ (len=13), ["C source text"], swap_endian=0
signature file-magic-auto560 { signature file-magic-auto560 {
file-mime "text/x-c", 43 file-mime "text/x-c", 43
file-magic /(^struct[ \x09\x0a]+)/ file-magic /(.*)(struct[ \x09\x0a]+)/
} }
# >0 search/w/1,=#!/bin/nodejs (len=13), ["Node.js script text executable"], swap_endian=0 # >0 search/w/1,=#!/bin/nodejs (len=13), ["Node.js script text executable"], swap_endian=0
@ -3766,11 +3770,12 @@ signature file-magic-auto561 {
file-magic /(.*)(\x23\x21\x2fbin\x2fnodejs)/ file-magic /(.*)(\x23\x21\x2fbin\x2fnodejs)/
} }
# Not specific enough.
# >0 regex,=^public: (len=8), ["C++ source text"], swap_endian=0 # >0 regex,=^public: (len=8), ["C++ source text"], swap_endian=0
signature file-magic-auto562 { #signature file-magic-auto562 {
file-mime "text/x-c++", 43 # file-mime "text/x-c++", 43
file-magic /(^public:)/ # file-magic /(.*)(public:)/
} #}
# >0 search/wct/4096,=<script (len=7), ["HTML document text"], swap_endian=0 # >0 search/wct/4096,=<script (len=7), ["HTML document text"], swap_endian=0
signature file-magic-auto563 { signature file-magic-auto563 {
@ -3778,17 +3783,19 @@ signature file-magic-auto563 {
file-magic /(.*)(\x3c[sS][cC][rR][iI][pP][tT])/ file-magic /(.*)(\x3c[sS][cC][rR][iI][pP][tT])/
} }
# Doesn't seem specific enough.
# >0 regex,=^float[ \t\n]+ (len=12), ["C source text"], swap_endian=0 # >0 regex,=^float[ \t\n]+ (len=12), ["C source text"], swap_endian=0
signature file-magic-auto564 { #signature file-magic-auto564 {
file-mime "text/x-c", 42 # file-mime "text/x-c", 42
file-magic /(^float[ \x09\x0a]+)/ # file-magic /(^float[ \x09\x0a]+)/
} #}
# Doesn't seem specific enough.
# >0 regex,=^union[ \t\n]+ (len=12), ["C source text"], swap_endian=0 # >0 regex,=^union[ \t\n]+ (len=12), ["C source text"], swap_endian=0
signature file-magic-auto565 { #signature file-magic-auto565 {
file-mime "text/x-c", 42 # file-mime "text/x-c", 42
file-magic /(^union[ \x09\x0a]+)/ # file-magic /(^union[ \x09\x0a]+)/
} #}
# The use of non-sequential offsets and relational operations made the # The use of non-sequential offsets and relational operations made the
# autogenerated signature incorrect. # autogenerated signature incorrect.
@ -3810,7 +3817,7 @@ signature file-magic-auto567 {
# >0 regex,=^char[ \t\n]+ (len=11), ["C source text"], swap_endian=0 # >0 regex,=^char[ \t\n]+ (len=11), ["C source text"], swap_endian=0
signature file-magic-auto568 { signature file-magic-auto568 {
file-mime "text/x-c", 41 file-mime "text/x-c", 41
file-magic /(^char[ \x09\x0a]+)/ file-magic /(.*)(char[ \x09\x0a]+)/
} }
# >0 search/1,=#! (len=2), [""], swap_endian=0 # >0 search/1,=#! (len=2), [""], swap_endian=0
@ -3911,11 +3918,12 @@ signature file-magic-auto581 {
file-magic /(.*)(main\x28)/ file-magic /(.*)(main\x28)/
} }
# Not specific enough.
# >0 search/1,=\" (len=2), ["troff or preprocessor input text"], swap_endian=0 # >0 search/1,=\" (len=2), ["troff or preprocessor input text"], swap_endian=0
signature file-magic-auto582 { #signature file-magic-auto582 {
file-mime "text/troff", 40 # file-mime "text/troff", 40
file-magic /(.*)(\x5c\x22)/ # file-magic /(.*)(\x5c\x22)/
} #}
# >0 search/4096,=(defparam (len=10), ["Lisp/Scheme program text"], swap_endian=0 # >0 search/4096,=(defparam (len=10), ["Lisp/Scheme program text"], swap_endian=0
signature file-magic-auto583 { signature file-magic-auto583 {
@ -3929,16 +3937,17 @@ signature file-magic-auto584 {
file-magic /(.*)(\x28autoload )/ file-magic /(.*)(\x28autoload )/
} }
#This signature seems too generic.
# >0 search/1,=diff (len=5), ["diff output text"], swap_endian=0 # >0 search/1,=diff (len=5), ["diff output text"], swap_endian=0
signature file-magic-auto585 { #signature file-magic-auto585 {
file-mime "text/x-diff", 40 # file-mime "text/x-diff", 40
file-magic /(.*)(diff )/ # file-magic /(.*)(diff )/
} #}
# >0 regex,=^#include (len=9), ["C source text"], swap_endian=0 # >0 regex,=^#include (len=9), ["C source text"], swap_endian=0
signature file-magic-auto586 { signature file-magic-auto586 {
file-mime "text/x-c", 39 file-mime "text/x-c", 39
file-magic /(^#include)/ file-magic /(.*)(#include)/
} }
# >0 search/1,=.\" (len=3), ["troff or preprocessor input text"], swap_endian=0 # >0 search/1,=.\" (len=3), ["troff or preprocessor input text"], swap_endian=0
@ -4006,7 +4015,7 @@ signature file-magic-auto596 {
# >0 regex,=^SUBDIRS (len=8), ["automake makefile script text"], swap_endian=0 # >0 regex,=^SUBDIRS (len=8), ["automake makefile script text"], swap_endian=0
signature file-magic-auto597 { signature file-magic-auto597 {
file-mime "text/x-makefile", 38 file-mime "text/x-makefile", 38
file-magic /(^SUBDIRS)/ file-magic /(.*)(SUBDIRS)/
} }
# >0 search/4096,=(defvar (len=8), ["Lisp/Scheme program text"], swap_endian=0 # >0 search/4096,=(defvar (len=8), ["Lisp/Scheme program text"], swap_endian=0
@ -4015,11 +4024,12 @@ signature file-magic-auto598 {
file-magic /(.*)(\x28defvar )/ file-magic /(.*)(\x28defvar )/
} }
# Not specific enough.
# >0 regex,=^program (len=8), ["Pascal source text"], swap_endian=0 # >0 regex,=^program (len=8), ["Pascal source text"], swap_endian=0
signature file-magic-auto599 { #signature file-magic-auto599 {
file-mime "text/x-pascal", 38 # file-mime "text/x-pascal", 38
file-magic /(^program)/ # file-magic /(^program)/
} #}
# >0 search/1,=Only in (len=8), ["diff output text"], swap_endian=0 # >0 search/1,=Only in (len=8), ["diff output text"], swap_endian=0
signature file-magic-auto600 { signature file-magic-auto600 {
@ -4027,11 +4037,12 @@ signature file-magic-auto600 {
file-magic /(.*)(Only in )/ file-magic /(.*)(Only in )/
} }
# This signature doesn't seem specific enough.
# >0 search/1,=*** (len=4), ["diff output text"], swap_endian=0 # >0 search/1,=*** (len=4), ["diff output text"], swap_endian=0
signature file-magic-auto601 { #signature file-magic-auto601 {
file-mime "text/x-diff", 38 # file-mime "text/x-diff", 38
file-magic /(.*)(\x2a\x2a\x2a )/ # file-magic /(.*)(\x2a\x2a\x2a )/
} #}
# >0 search/1,='.\" (len=4), ["troff or preprocessor input text"], swap_endian=0 # >0 search/1,='.\" (len=4), ["troff or preprocessor input text"], swap_endian=0
signature file-magic-auto602 { signature file-magic-auto602 {
@ -4039,11 +4050,12 @@ signature file-magic-auto602 {
file-magic /(.*)(\x27\x2e\x5c\x22)/ file-magic /(.*)(\x27\x2e\x5c\x22)/
} }
# LDFLAGS appears in other contexts, e.g. shell script.
# >0 regex,=^LDFLAGS (len=8), ["makefile script text"], swap_endian=0 # >0 regex,=^LDFLAGS (len=8), ["makefile script text"], swap_endian=0
signature file-magic-auto603 { #signature file-magic-auto603 {
file-mime "text/x-makefile", 38 # file-mime "text/x-makefile", 38
file-magic /(^LDFLAGS)/ # file-magic /(.*)(LDFLAGS)/
} #}
# >0 search/8192,="libhdr" (len=8), ["BCPL source text"], swap_endian=0 # >0 search/8192,="libhdr" (len=8), ["BCPL source text"], swap_endian=0
signature file-magic-auto604 { signature file-magic-auto604 {
@ -4051,16 +4063,17 @@ signature file-magic-auto604 {
file-magic /(.*)(\x22libhdr\x22)/ file-magic /(.*)(\x22libhdr\x22)/
} }
# Not specific enough.
# >0 regex,=^record (len=7), ["Pascal source text"], swap_endian=0 # >0 regex,=^record (len=7), ["Pascal source text"], swap_endian=0
signature file-magic-auto605 { #signature file-magic-auto605 {
file-mime "text/x-pascal", 37 # file-mime "text/x-pascal", 37
file-magic /(^record)/ # file-magic /(^record)/
} #}
# >0 regex,=^CFLAGS (len=7), ["makefile script text"], swap_endian=0 # >0 regex,=^CFLAGS (len=7), ["makefile script text"], swap_endian=0
signature file-magic-auto606 { signature file-magic-auto606 {
file-mime "text/x-makefile", 37 file-mime "text/x-makefile", 37
file-magic /(^CFLAGS)/ file-magic /(.*)(CFLAGS)/
} }
# >0 search/4096,=(defun (len=7), ["Lisp/Scheme program text"], swap_endian=0 # >0 search/4096,=(defun (len=7), ["Lisp/Scheme program text"], swap_endian=0
@ -4081,11 +4094,12 @@ signature file-magic-auto609 {
file-magic /(.*)(\x28input\x2c)/ file-magic /(.*)(\x28input\x2c)/
} }
# Not specific enough.
# >0 search/1,=Index: (len=6), ["RCS/CVS diff output text"], swap_endian=0 # >0 search/1,=Index: (len=6), ["RCS/CVS diff output text"], swap_endian=0
signature file-magic-auto610 { #signature file-magic-auto610 {
file-mime "text/x-diff", 36 # file-mime "text/x-diff", 44
file-magic /(.*)(Index\x3a)/ # file-magic /(.*)(Index\x3a)/
} #}
# >0 search/4096,=(setq (len=6), ["Lisp/Scheme program text"], swap_endian=0 # >0 search/4096,=(setq (len=6), ["Lisp/Scheme program text"], swap_endian=0
signature file-magic-auto611 { signature file-magic-auto611 {

View file

@ -65,10 +65,11 @@ export {
## A set of analysis types done during the file analysis. ## A set of analysis types done during the file analysis.
analyzers: set[string] &log; analyzers: set[string] &log;
## A mime type provided by libmagic against the *bof_buffer* ## A mime type provided by the strongest file magic signature
## field of :bro:see:`fa_file`, or in the cases where no ## match against the *bof_buffer* field of :bro:see:`fa_file`,
## buffering of the beginning of file occurs, an initial ## or in the cases where no buffering of the beginning of file
## guess of the mime type based on the first data seen. ## occurs, an initial guess of the mime type based on the first
## data seen.
mime_type: string &log &optional; mime_type: string &log &optional;
## A filename for the file if one is available from the source ## A filename for the file if one is available from the source

View file

@ -396,10 +396,15 @@ type fa_file: record {
## This is also the buffer that's used for file/mime type detection. ## This is also the buffer that's used for file/mime type detection.
bof_buffer: string &optional; bof_buffer: string &optional;
## A mime type provided by libmagic against the *bof_buffer*, or ## The mime type of the strongest file magic signature matches against
## in the cases where no buffering of the beginning of file occurs, ## the data chunk in *bof_buffer*, or in the cases where no buffering
## an initial guess of the mime type based on the first data seen. ## of the beginning of file occurs, an initial guess of the mime type
## based on the first data seen.
mime_type: string &optional; mime_type: string &optional;
## All mime types that matched file magic signatures against the data
## chunk in *bof_buffer*, in order of their strength value.
mime_types: mime_matches &optional;
} &redef; } &redef;
## Fields of a SYN packet. ## Fields of a SYN packet.

View file

@ -37,7 +37,7 @@ export {
event file_hash(f: fa_file, kind: string, hash: string) event file_hash(f: fa_file, kind: string, hash: string)
{ {
if ( kind=="sha1" && match_file_types in f$mime_type ) if ( kind=="sha1" && f?$mime_type && match_file_types in f$mime_type )
{ {
local hash_domain = fmt("%s.malware.hash.cymru.com", hash); local hash_domain = fmt("%s.malware.hash.cymru.com", hash);
when ( local MHR_result = lookup_hostname_txt(hash_domain) ) when ( local MHR_result = lookup_hostname_txt(hash_domain) )

View file

@ -872,24 +872,7 @@ function file_magic%(data: string%): mime_matches
%{ %{
RuleMatcher::MIME_Matches matches; RuleMatcher::MIME_Matches matches;
file_mgr->DetectMIME(data->Bytes(), data->Len(), &matches); file_mgr->DetectMIME(data->Bytes(), data->Len(), &matches);
VectorVal* rval = new VectorVal(mime_matches); return file_analysis::GenMIMEMatchesVal(matches);
for ( RuleMatcher::MIME_Matches::const_iterator it = matches.begin();
it != matches.end(); ++it )
{
RecordVal* element = new RecordVal(mime_match);
for ( set<string>::const_iterator it2 = it->second.begin();
it2 != it->second.end(); ++it2 )
{
element->Assign(0, new Val(it->first, TYPE_INT));
element->Assign(1, new StringVal(*it2));
}
rval->Assign(rval->Size(), element);
}
return rval;
%} %}
## Performs an entropy test on the given data. ## Performs an entropy test on the given data.

View file

@ -53,6 +53,7 @@ int File::timeout_interval_idx = -1;
int File::bof_buffer_size_idx = -1; int File::bof_buffer_size_idx = -1;
int File::bof_buffer_idx = -1; int File::bof_buffer_idx = -1;
int File::mime_type_idx = -1; int File::mime_type_idx = -1;
int File::mime_types_idx = -1;
void File::StaticInit() void File::StaticInit()
{ {
@ -73,6 +74,7 @@ void File::StaticInit()
bof_buffer_size_idx = Idx("bof_buffer_size"); bof_buffer_size_idx = Idx("bof_buffer_size");
bof_buffer_idx = Idx("bof_buffer"); bof_buffer_idx = Idx("bof_buffer");
mime_type_idx = Idx("mime_type"); mime_type_idx = Idx("mime_type");
mime_types_idx = Idx("mime_types");
} }
File::File(const string& file_id, Connection* conn, analyzer::Tag tag, File::File(const string& file_id, Connection* conn, analyzer::Tag tag,
@ -280,12 +282,15 @@ bool File::BufferBOF(const u_char* data, uint64 len)
bool File::DetectMIME(const u_char* data, uint64 len) bool File::DetectMIME(const u_char* data, uint64 len)
{ {
string strongest_match = file_mgr->DetectMIME(data, len); RuleMatcher::MIME_Matches matches;
file_mgr->DetectMIME(data, len, &matches);
if ( strongest_match.empty() ) if ( matches.empty() )
return false; return false;
val->Assign(mime_type_idx, new StringVal(strongest_match)); val->Assign(mime_type_idx,
new StringVal(*(matches.begin()->second.begin())));
val->Assign(mime_types_idx, file_analysis::GenMIMEMatchesVal(matches));
return true; return true;
} }

View file

@ -283,6 +283,7 @@ private:
static int bof_buffer_size_idx; static int bof_buffer_size_idx;
static int bof_buffer_idx; static int bof_buffer_idx;
static int mime_type_idx; static int mime_type_idx;
static int mime_types_idx;
}; };
} // namespace file_analysis } // namespace file_analysis

View file

@ -425,3 +425,25 @@ string Manager::DetectMIME(const u_char* data, uint64 len) const
return *(matches.begin()->second.begin()); return *(matches.begin()->second.begin());
} }
/**
 * Converts a set of MIME signature matches into a script-layer
 * \c mime_matches vector.
 *
 * One \c mime_match record is appended for every MIME string found in \a m,
 * visiting the map in its own iteration order. Field 0 of each record
 * receives the map key as a TYPE_INT value (presumably the signature
 * strength -- confirm against RuleMatcher), and field 1 receives the MIME
 * type string.
 *
 * @param m The MIME match information with which to populate the value.
 * @return A newly allocated vector value; the caller takes ownership.
 */
VectorVal* file_analysis::GenMIMEMatchesVal(const RuleMatcher::MIME_Matches& m)
	{
	VectorVal* rval = new VectorVal(mime_matches);

	for ( RuleMatcher::MIME_Matches::const_iterator it = m.begin();
	      it != m.end(); ++it )
		{
		for ( set<string>::const_iterator it2 = it->second.begin();
		      it2 != it->second.end(); ++it2 )
			{
			// Allocate a fresh record per MIME string.  Sharing a single
			// record across the whole set would overwrite fields 0 and 1
			// on every iteration, so only the last MIME type under each
			// key would ever be reported.
			RecordVal* element = new RecordVal(mime_match);
			element->Assign(0, new Val(it->first, TYPE_INT));
			element->Assign(1, new StringVal(*it2));
			rval->Assign(rval->Size(), element);
			}
		}

	return rval;
	}

View file

@ -285,6 +285,7 @@ public:
*/ */
std::string DetectMIME(const u_char* data, uint64 len) const; std::string DetectMIME(const u_char* data, uint64 len) const;
protected: protected:
friend class FileTimer; friend class FileTimer;
@ -370,6 +371,12 @@ private:
static string salt; /**< A salt added to file handles before hashing. */ static string salt; /**< A salt added to file handles before hashing. */
}; };
/**
* Returns a script-layer value corresponding to the \c mime_matches type.
* @param m The MIME match information with which to populate the value.
*/
VectorVal* GenMIMEMatchesVal(const RuleMatcher::MIME_Matches& m);
} // namespace file_analysis } // namespace file_analysis
extern file_analysis::Manager* file_mgr; extern file_analysis::Manager* file_mgr;

View file

@ -41,7 +41,7 @@ export {
event file_hash(f: fa_file, kind: string, hash: string) event file_hash(f: fa_file, kind: string, hash: string)
{ {
if ( kind=="sha1" && match_file_types in f$mime_type ) if ( kind=="sha1" && f?$mime_type && match_file_types in f$mime_type )
{ {
local hash_domain = fmt("%s.malware.hash.cymru.com", hash); local hash_domain = fmt("%s.malware.hash.cymru.com", hash);
when ( local MHR_result = lookup_hostname_txt(hash_domain) ) when ( local MHR_result = lookup_hostname_txt(hash_domain) )

View file

@ -4,7 +4,7 @@ detect-MHR.bro
event file_hash(f: fa_file, kind: string, hash: string) event file_hash(f: fa_file, kind: string, hash: string)
{ {
if ( kind=="sha1" && match_file_types in f$mime_type ) if ( kind=="sha1" && f?$mime_type && match_file_types in f$mime_type )
{ {
local hash_domain = fmt("%s.malware.hash.cymru.com", hash); local hash_domain = fmt("%s.malware.hash.cymru.com", hash);
when ( local MHR_result = lookup_hostname_txt(hash_domain) ) when ( local MHR_result = lookup_hostname_txt(hash_domain) )

View file

@ -16,16 +16,16 @@
#empty_field (empty) #empty_field (empty)
#unset_field - #unset_field -
#path mime_metrics #path mime_metrics
#open 2014-03-03-22-45-14 #open 2014-03-06-17-30-44
#fields ts ts_delta mtype uniq_hosts hits bytes #fields ts ts_delta mtype uniq_hosts hits bytes
#types time interval string count count count #types time interval string count count count
1389719059.311698 300.000000 text/html 1 4 53070 1389719059.311698 300.000000 text/html 1 4 53070
1389719059.311698 300.000000 image/jpeg 1 1 186859 1389719059.311698 300.000000 image/jpeg 1 1 186859
1389719059.311698 300.000000 text/troff 1 1 3180
1389719059.311698 300.000000 application/pgp-signature 1 1 836 1389719059.311698 300.000000 application/pgp-signature 1 1 836
1389719059.311698 300.000000 binary 1 1 3180
1389719059.311698 300.000000 text/plain 1 12 113982 1389719059.311698 300.000000 text/plain 1 12 113982
1389719059.311698 300.000000 image/gif 1 1 172 1389719059.311698 300.000000 image/gif 1 1 172
1389719059.311698 300.000000 image/png 1 9 82176 1389719059.311698 300.000000 image/png 1 9 82176
1389719059.311698 300.000000 image/x-icon 1 2 2300 1389719059.311698 300.000000 image/x-icon 1 2 2300
#close 2014-03-03-22-45-14 #close 2014-03-06-17-30-44

View file

@ -41,7 +41,7 @@ export {
event file_hash(f: fa_file, kind: string, hash: string) event file_hash(f: fa_file, kind: string, hash: string)
{ {
if ( kind=="sha1" && match_file_types in f$mime_type ) if ( kind=="sha1" && f?$mime_type && match_file_types in f$mime_type )
{ {
local hash_domain = fmt("%s.malware.hash.cymru.com", hash); local hash_domain = fmt("%s.malware.hash.cymru.com", hash);
when ( local MHR_result = lookup_hostname_txt(hash_domain) ) when ( local MHR_result = lookup_hostname_txt(hash_domain) )

View file

@ -4,7 +4,7 @@ detect-MHR.bro
event file_hash(f: fa_file, kind: string, hash: string) event file_hash(f: fa_file, kind: string, hash: string)
{ {
if ( kind=="sha1" && match_file_types in f$mime_type ) if ( kind=="sha1" && f?$mime_type && match_file_types in f$mime_type )
{ {
local hash_domain = fmt("%s.malware.hash.cymru.com", hash); local hash_domain = fmt("%s.malware.hash.cymru.com", hash);
when ( local MHR_result = lookup_hostname_txt(hash_domain) ) when ( local MHR_result = lookup_hostname_txt(hash_domain) )