From 095a68b2ec0ea97bd7da20888effdda0069f3cfc Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Thu, 6 Mar 2014 11:41:10 -0600 Subject: [PATCH] Various minor changes related to file mime type detection. - Improve or just remove some file magic signatures ported from libmagic that were too general and matched incorrectly too often. - Fix MHR script's use of fa_file$mime_type before checking if it's initialized. It may be uninitialized if no signatures match. - The "fa_file" record now contains a "mime_types" field that contains all magic signatures that matched the file content (where the "mime_type" field is just a shortcut for the strongest match). --- .../base/frameworks/files/magic/general.sig | 8 +- .../base/frameworks/files/magic/libmagic.sig | 226 ++++++++++-------- scripts/base/frameworks/files/main.bro | 9 +- scripts/base/init-bare.bro | 11 +- .../policy/frameworks/files/detect-MHR.bro | 2 +- src/bro.bif | 19 +- src/file_analysis/File.cc | 11 +- src/file_analysis/File.h | 1 + src/file_analysis/Manager.cc | 22 ++ src/file_analysis/Manager.h | 7 + .../output | 2 +- .../output | 2 +- .../btest-doc.sphinx.mimestats#1 | 6 +- ...licy_frameworks_files_detect-MHR_bro.btest | 2 +- ...cy_frameworks_files_detect-MHR_bro@4.btest | 2 +- 15 files changed, 187 insertions(+), 143 deletions(-) diff --git a/scripts/base/frameworks/files/magic/general.sig b/scripts/base/frameworks/files/magic/general.sig index 595bcc2f62..ce252dd218 100644 --- a/scripts/base/frameworks/files/magic/general.sig +++ b/scripts/base/frameworks/files/magic/general.sig @@ -6,6 +6,12 @@ signature file-plaintext { } signature file-binary { - file-magic /(.*)([^[:print:][:space:]]+)/ + # Exclude bytes that can be ASCII or some ISO-8859 characters. + file-magic /(.*)([^[:print:][:space:]\xa0-\xff]+)/ file-mime "binary", -10 } + +signature file-tar { + file-magic /([[:print:]\x00]){100}(([[:digit:]\x00\x20]){8}){3}/ + file-mime "application/x-tar", 150 +} diff --git a/scripts/base/frameworks/files/magic/libmagic.sig b/scripts/base/frameworks/files/magic/libmagic.sig index 25f5ba8c0f..3fa8d47ab5 100644 --- a/scripts/base/frameworks/files/magic/libmagic.sig +++ b/scripts/base/frameworks/files/magic/libmagic.sig @@ -569,7 +569,7 @@ signature file-magic-auto87 { # >>>>&0 search/1024,=\n (len=1), [""], swap_endian=0 # >>>>>&0 search/1,=@@ (len=2), ["unified diff output text"], swap_endian=0 signature file-magic-auto88 { - file-mime "text/x-diff", 40 + file-mime "text/x-diff", 55 file-magic /(.*)(\x2d\x2d\x2d )(.*)(\x0a)(.*)(\x2b\x2b\x2b )(.*)(\x0a)(.*)(\x40\x40)/ } @@ -2643,7 +2643,7 @@ signature file-magic-auto388 { # >>&0 regex,= {0,50}\(([a-zA-Z]|,| ){1,500}\):$ (len=34), ["Python script text executable"], swap_endian=0 signature file-magic-auto389 { file-mime "text/x-python", 64 - file-magic /(^( |\t){0,50}def {1,50}[a-zA-Z]{1,100})( {0,50}\(([a-zA-Z]|,| ){1,500}\):$)/ + file-magic /(.*)(( |\t){0,50}def {1,50}[a-zA-Z]{1,100})( {0,50}\(([a-zA-Z]|,| ){1,500}\):$)/ } # >0 search/4096,=\documentstyle (len=14), ["LaTeX document text"], swap_endian=0 @@ -2704,7 +2704,7 @@ signature file-magic-auto397 { # >>>0 regex,=^[ \t]*end([ \t]*[;#].*)?$ (len=24), ["Ruby script text"], swap_endian=0 signature file-magic-auto398 { file-mime "text/x-ruby", 54 - file-magic /(^[ \x09]*require[ \x09]'[A-Za-z_\x2f]+')(include [A-Z]|def [a-z]| do$)(^[ \x09]*end([ \x09]*[;#].*)?$)/ + file-magic /(.*)([ \x09]*require[ \x09]'[A-Za-z_\x2f]+')(include [A-Z]|def [a-z]| do$)(^[ \x09]*end([ \x09]*[;#].*)?$)/ } # >0 search/1,=eval "exec /usr/local/bin/perl (len=30), ["Perl script text"], swap_endian=0 @@ -2760,7 +2760,7 @@ signature file-magic-auto406 { # >>>0 regex,=^[ \t]*end([ \t]*[;#].*)?$ (len=24), ["Ruby module source text"], swap_endian=0 signature file-magic-auto407 { file-mime "text/x-ruby", 54 - file-magic /(^[ \x09]*(class|module)[ \x09][A-Z])((modul|includ)e [A-Z]|def [a-z])(^[ \x09]*end([ \x09]*[;#].*)?$)/ + file-magic /(.*)([ \x09]*(class|module)[ \x09][A-Z])((modul|includ)e [A-Z]|def [a-z])(^[ \x09]*end([ \x09]*[;#].*)?$)/ } # >512 string/b,=\354\245\301 (len=3), ["Microsoft Word Document"], swap_endian=0 @@ -2797,7 +2797,7 @@ signature file-magic-auto412 { # >0 regex,=^from\s+(\w|\.)+\s+import.*$ (len=28), ["Python script text executable"], swap_endian=0 signature file-magic-auto413 { file-mime "text/x-python", 58 - file-magic /(^from\s+(\w|\.)+\s+import.*$)/ + file-magic /(.*)(from\s+(\w|\.)+\s+import.*$)/ } # >0 search/4096,=\contentsline (len=13), ["LaTeX table of contents"], swap_endian=0 @@ -3342,11 +3342,12 @@ signature file-magic-auto497 { file-magic /(.{4})(jP)/ } +# Not specific enough. # >0 regex,=^template[ \t\n]+ (len=15), ["C++ source text"], swap_endian=0 -signature file-magic-auto498 { - file-mime "text/x-c++", 50 - file-magic /(^template[ \x09\x0a]+)/ -} +#signature file-magic-auto498 { +# file-mime "text/x-c++", 50 +# file-magic /(.*)(template[ \x09\x0a]+)/ +#} # >0 search/c/1,=0 regex,=^[ \t]{0,50}\.asciiz (len=19), ["assembler source text"], swap_endian=0 signature file-magic-auto510 { file-mime "text/x-asm", 49 - file-magic /(^[ \x09]{0,50}\.asciiz)/ + file-magic /(^[ \x09]{0,50}\.(asciiz|asciz|section|globl|align|even|byte|file|type))/ } +# >0 regex,=^[ \t]{0,50}\.globl (len=18), ["assembler source text"], swap_endian=0 +#signature file-magic-auto517 { +# file-mime "text/x-asm", 48 +# file-magic /(^[ \x09]{0,50}\.globl)/ +#} + +# >0 regex,=^[ \t]{0,50}\.text (len=17), ["assembler source text"], swap_endian=0 +#signature file-magic-auto523 { +# file-mime "text/x-asm", 47 +# file-magic /(^[ \x09]{0,50}\.text)/ +#} + +# >0 regex,=^[ \t]{0,50}\.even (len=17), ["assembler source text"], swap_endian=0 +#signature file-magic-auto524 { +# file-mime "text/x-asm", 47 +# file-magic /(^[ \x09]{0,50}\.even)/ +#} + +# >0 regex,=^[ \t]{0,50}\.byte (len=17), ["assembler source text"], swap_endian=0 +#signature file-magic-auto525 { +# file-mime "text/x-asm", 47 +# file-magic /(^[ \x09]{0,50}\.byte)/ +#} + +# >0 regex,=^[ \t]{0,50}\.file (len=17), ["assembler source text"], swap_endian=0 +#signature file-magic-auto526 { +# file-mime "text/x-asm", 47 +# file-magic /(^[ \x09]{0,50}\.file)/ +#} + +# >0 regex,=^[ \t]{0,50}\.type (len=17), ["assembler source text"], swap_endian=0 +#signature file-magic-auto527 { +# file-mime "text/x-asm", 47 +# file-magic /(^[ \x09]{0,50}\.type)/ +#} + + # >0 search/1,=#!/usr/bin/env perl (len=19), ["Perl script text executable"], swap_endian=0 signature file-magic-auto511 { file-mime "text/x-perl", 49 @@ -3432,11 +3470,12 @@ signature file-magic-auto512 { file-magic /(.*)(\x3c\x21[dD][oO][cC][tT][yY][pP][eE] {1,}[hH][tT][mM][lL])/ } +# This doesn't seem specific enough. # >0 regex,=^virtual[ \t\n]+ (len=14), ["C++ source text"], swap_endian=0 -signature file-magic-auto513 { - file-mime "text/x-c++", 49 - file-magic /(^virtual[ \x09\x0a]+)/ -} +#signature file-magic-auto513 { +# file-mime "text/x-c++", 49 +# file-magic /(.*)(virtual[ \x09\x0a]+)/ +#} # >0 search/1,=#! /usr/bin/env lua (len=19), ["Lua script text executable"], swap_endian=0 signature file-magic-auto514 { @@ -3455,13 +3494,6 @@ signature file-magic-auto516 { file-mime "text/x-tcl", 49 file-magic /(.*)(\x23\x21 \x2fusr\x2fbin\x2fenv tcl)/ } - -# >0 regex,=^[ \t]{0,50}\.globl (len=18), ["assembler source text"], swap_endian=0 -signature file-magic-auto517 { - file-mime "text/x-asm", 48 - file-magic /(^[ \x09]{0,50}\.globl)/ -} - # >0 search/1,=#!/usr/bin/env tcl (len=18), ["Tcl script text executable"], swap_endian=0 signature file-magic-auto518 { file-mime "text/x-tcl", 48 @@ -3489,37 +3521,7 @@ signature file-magic-auto521 { # >0 regex,=^class[ \t\n]+ (len=12), ["C++ source text"], swap_endian=0 signature file-magic-auto522 { file-mime "text/x-c++", 47 - file-magic /(^class[ \x09\x0a]+)/ -} - -# >0 regex,=^[ \t]{0,50}\.text (len=17), ["assembler source text"], swap_endian=0 -signature file-magic-auto523 { - file-mime "text/x-asm", 47 - file-magic /(^[ \x09]{0,50}\.text)/ -} - -# >0 regex,=^[ \t]{0,50}\.even (len=17), ["assembler source text"], swap_endian=0 -signature file-magic-auto524 { - file-mime "text/x-asm", 47 - file-magic /(^[ \x09]{0,50}\.even)/ -} - -# >0 regex,=^[ \t]{0,50}\.byte (len=17), ["assembler source text"], swap_endian=0 -signature file-magic-auto525 { - file-mime "text/x-asm", 47 - file-magic /(^[ \x09]{0,50}\.byte)/ -} - -# >0 regex,=^[ \t]{0,50}\.file (len=17), ["assembler source text"], swap_endian=0 -signature file-magic-auto526 { - file-mime "text/x-asm", 47 - file-magic /(^[ \x09]{0,50}\.file)/ -} - -# >0 regex,=^[ \t]{0,50}\.type (len=17), ["assembler source text"], swap_endian=0 -signature file-magic-auto527 { - file-mime "text/x-asm", 47 - file-magic /(^[ \x09]{0,50}\.type)/ + file-magic /(.*)(class[ \x09\x0a]+[[:alnum:]_]+)(.*)(\x7b)(.*)(public:)/ } # >0 search/1,=This is Info file (len=17), ["GNU Info text"], swap_endian=0 @@ -3717,11 +3719,12 @@ signature file-magic-auto553 { file-magic /(.*)(\x5cinput texinfo)/ } +# Not specific enough. # >0 regex,=^private: (len=9), ["C++ source text"], swap_endian=0 -signature file-magic-auto554 { - file-mime "text/x-c++", 44 - file-magic /(^private:)/ -} +#signature file-magic-auto554 { +# file-mime "text/x-c++", 44 +# file-magic /(.*)(private:)/ +#} # >0 search/4096,=def __init__ (len=12), [""], swap_endian=0 # >>&0 search/64,=self (len=4), ["Python script text executable"], swap_endian=0 @@ -3739,7 +3742,7 @@ signature file-magic-auto556 { # >0 regex,=^extern[ \t\n]+ (len=13), ["C source text"], swap_endian=0 signature file-magic-auto557 { file-mime "text/x-c", 43 - file-magic /(^extern[ \x09\x0a]+)/ + file-magic /(.*)(extern[ \x09\x0a]+)/ } # >0 search/4096,=% -*-latex-*- (len=13), ["LaTeX document text"], swap_endian=0 @@ -3748,16 +3751,17 @@ signature file-magic-auto558 { file-magic /(.*)(\x25 \x2d\x2a\x2dlatex\x2d\x2a\x2d)/ } +# Doesn't seem specific enough. # >0 regex,=^double[ \t\n]+ (len=13), ["C source text"], swap_endian=0 -signature file-magic-auto559 { - file-mime "text/x-c", 43 - file-magic /(^double[ \x09\x0a]+)/ -} +#signature file-magic-auto559 { +# file-mime "text/x-c", 43 +# file-magic /(^double[ \x09\x0a]+)/ +#} # >0 regex,=^struct[ \t\n]+ (len=13), ["C source text"], swap_endian=0 signature file-magic-auto560 { file-mime "text/x-c", 43 - file-magic /(^struct[ \x09\x0a]+)/ + file-magic /(.*)(struct[ \x09\x0a]+)/ } # >0 search/w/1,=#!/bin/nodejs (len=13), ["Node.js script text executable"], swap_endian=0 @@ -3766,11 +3770,12 @@ signature file-magic-auto561 { file-magic /(.*)(\x23\x21\x2fbin\x2fnodejs)/ } +# Not specific enough. # >0 regex,=^public: (len=8), ["C++ source text"], swap_endian=0 -signature file-magic-auto562 { - file-mime "text/x-c++", 43 - file-magic /(^public:)/ -} +#signature file-magic-auto562 { +# file-mime "text/x-c++", 43 +# file-magic /(.*)(public:)/ +#} # >0 search/wct/4096,=