GH-340: Improve IPv4/IPv6 regexes, extraction, and validity functions

* is_valid_ip() is now implemented as a BIF instead of in
  base/utils/addrs

* The IPv4 and IPv6 regular expressions provided by base/utils/addrs
  have been improved/corrected (previously they could possibly match
  some invalid IPv4 decimals, or various "zero compressed" IPv6 strings
  with too many hextets)

* extract_ip_addresses() should give better results as a consequence
  of the above two points
This commit is contained in:
Jon Siwek 2019-04-18 19:04:39 -07:00
parent 9421ee0293
commit 7144661930
8 changed files with 200 additions and 106 deletions

2
doc

@ -1 +1 @@
Subproject commit 9b556e5e71d0d8a5c2e7a1d4be4b308d887310f1 Subproject commit 34e9f9add97e67c9768540433cdccf221b592a4e

View file

@ -1,31 +1,67 @@
##! Functions for parsing and manipulating IP and MAC addresses. ##! Functions for parsing and manipulating IP and MAC addresses.
# Regular expressions for matching IP addresses in strings. # Regular expressions for matching IP addresses in strings.
const ipv4_addr_regex = /[[:digit:]]{1,3}\.[[:digit:]]{1,3}\.[[:digit:]]{1,3}\.[[:digit:]]{1,3}/;
const ipv6_8hex_regex = /([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4}/;
const ipv6_compressed_hex_regex = /(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)::(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)/;
const ipv6_hex4dec_regex = /(([0-9A-Fa-f]{1,4}:){6,6})([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)/;
const ipv6_compressed_hex4dec_regex = /(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)::(([0-9A-Fa-f]{1,4}:)*)([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)/;
# These are commented out until patterns can be constructed this way at init time. const ipv4_decim = /[0-9]{1}|[0-9]{2}|0[0-9]{2}|1[0-9]{2}|2[0-4][0-9]|25[0-5]/;
#const ipv6_addr_regex = ipv6_8hex_regex |
# ipv6_compressed_hex_regex |
# ipv6_hex4dec_regex |
# ipv6_compressed_hex4dec_regex;
#const ip_addr_regex = ipv4_addr_regex | ipv6_addr_regex;
const ipv6_addr_regex = const ipv4_addr_regex = ipv4_decim & /\./ & ipv4_decim & /\./ & ipv4_decim & /\./ & ipv4_decim;
/([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4}/ |
/(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)::(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)/ | # IPv6 Compressed Hex
/(([0-9A-Fa-f]{1,4}:){6,6})([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)/ | # 6Hex4Dec
/(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)::(([0-9A-Fa-f]{1,4}:)*)([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)/; # CompressedHex4Dec
const ip_addr_regex = const ipv6_hextet = /[0-9A-Fa-f]{1,4}/;
/[[:digit:]]{1,3}\.[[:digit:]]{1,3}\.[[:digit:]]{1,3}\.[[:digit:]]{1,3}/ |
/([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4}/ | const ipv6_8hex_regex = /([0-9A-Fa-f]{1,4}:){7}/ & ipv6_hextet;
/(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)::(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)/ | # IPv6 Compressed Hex
/(([0-9A-Fa-f]{1,4}:){6,6})([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)/ | # 6Hex4Dec const ipv6_hex4dec_regex = /([0-9A-Fa-f]{1,4}:){6}/ & ipv4_addr_regex;
/(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)::(([0-9A-Fa-f]{1,4}:)*)([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)/; # CompressedHex4Dec
# "Zero compressed" IPv6 addresses made up only of hextets. There is one
# pattern per possible count (0 through 7) of hextets before the "::", and
# each pattern caps how many hextets may follow the "::" so that the total
# never exceeds 7, i.e. the "::" always stands in for at least one hextet.
# The {0} and {0,0} repetitions match nothing extra; they only keep the
# patterns visually parallel.
const ipv6_compressed_lead_hextets0 = /::([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){0,6})?/;
const ipv6_compressed_lead_hextets1 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){0}::([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){0,5})?/;
const ipv6_compressed_lead_hextets2 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){1}::([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){0,4})?/;
const ipv6_compressed_lead_hextets3 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){2}::([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){0,3})?/;
const ipv6_compressed_lead_hextets4 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){3}::([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){0,2})?/;
const ipv6_compressed_lead_hextets5 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){4}::([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){0,1})?/;
const ipv6_compressed_lead_hextets6 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){5}::([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){0,0})?/;
const ipv6_compressed_lead_hextets7 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){6}::/;
# Matches any of the per-lead-count compressed forms defined above.
const ipv6_compressed_hex_regex = ipv6_compressed_lead_hextets0 |
ipv6_compressed_lead_hextets1 |
ipv6_compressed_lead_hextets2 |
ipv6_compressed_lead_hextets3 |
ipv6_compressed_lead_hextets4 |
ipv6_compressed_lead_hextets5 |
ipv6_compressed_lead_hextets6 |
ipv6_compressed_lead_hextets7;
# "Zero compressed" IPv6 addresses that end in a dotted-decimal IPv4 address.
# The IPv4 tail takes the place of the last two hextets, so at most 5 hextets
# may appear in total around the "::". There is one pattern per possible
# count (0 through 5) of hextets before the "::".
#
# Every hextet that follows the "::" carries its own trailing ":" so that a
# separator always sits between the last hextet and the IPv4 part. Writing
# the trailing hextets as "hextet(:hextet)*" instead leaves no ":" before the
# IPv4 part, which lets the leading digits of the first IPv4 decimal be
# consumed as a hextet: "::ffff:1.2.3.4" would fail to match, while
# "::ffff:999.1.1.1" would match (as hextet "99" followed by "9.1.1.1").
const ipv6_compressed_hext4dec_lead_hextets0 = /::([0-9A-Fa-f]{1,4}:){0,5}/ & ipv4_addr_regex;
const ipv6_compressed_hext4dec_lead_hextets1 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){0}::([0-9A-Fa-f]{1,4}:){0,4}/ & ipv4_addr_regex;
const ipv6_compressed_hext4dec_lead_hextets2 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){1}::([0-9A-Fa-f]{1,4}:){0,3}/ & ipv4_addr_regex;
const ipv6_compressed_hext4dec_lead_hextets3 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){2}::([0-9A-Fa-f]{1,4}:){0,2}/ & ipv4_addr_regex;
const ipv6_compressed_hext4dec_lead_hextets4 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){3}::([0-9A-Fa-f]{1,4}:){0,1}/ & ipv4_addr_regex;
const ipv6_compressed_hext4dec_lead_hextets5 = /[0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4}){4}::/ & ipv4_addr_regex;

# Matches any of the per-lead-count compressed hybrid forms defined above.
const ipv6_compressed_hex4dec_regex = ipv6_compressed_hext4dec_lead_hextets0 |
                                      ipv6_compressed_hext4dec_lead_hextets1 |
                                      ipv6_compressed_hext4dec_lead_hextets2 |
                                      ipv6_compressed_hext4dec_lead_hextets3 |
                                      ipv6_compressed_hext4dec_lead_hextets4 |
                                      ipv6_compressed_hext4dec_lead_hextets5;
# Any textual IPv6 form: the full 8-hextet form, the "::"-compressed form,
# and either of those with a trailing dotted-decimal IPv4 part.
const ipv6_addr_regex = ipv6_8hex_regex |
ipv6_compressed_hex_regex |
ipv6_hex4dec_regex |
ipv6_compressed_hex4dec_regex;
# Any IPv4 or IPv6 address.
const ip_addr_regex = ipv4_addr_regex | ipv6_addr_regex;
## Checks if all elements of a string array are a valid octet value. ## Checks if all elements of a string array are a valid octet value.
## ##
@ -44,49 +80,6 @@ function has_valid_octets(octets: string_vec): bool
return T; return T;
} }
## Reports whether a string is formatted as a valid IPv4 or IPv6 address.
##
## ip_str: the string to check for valid IP formatting.
##
## Returns: T if the string is a valid IPv4 or IPv6 address format.
function is_valid_ip(ip_str: string): bool
	{
	local parts: string_vec;

	if ( ip_str == ipv4_addr_regex )
		{
		parts = split_string(ip_str, /\./);

		if ( |parts| == 4 )
			return has_valid_octets(parts);

		return F;
		}

	if ( ip_str == ipv6_addr_regex )
		{
		local is_hybrid = ( ip_str == ipv6_hex4dec_regex ||
		                    ip_str == ipv6_compressed_hex4dec_regex );

		# Pure IPv6 formats that use only hex digits need no further
		# checks -- the regexes are expected to be complete for them.
		if ( ! is_hybrid )
			return T;

		# The regexes for the hybrid IPv6-IPv4 formats don't check for
		# valid octets within the IPv4 part, so check them here.
		parts = split_string(ip_str, /\./);

		if ( |parts| != 4 )
			return F;

		# Strip the IPv6 portion that still precedes the first octet.
		local v6_pieces = split_string(parts[0], /:/);
		parts[0] = v6_pieces[|v6_pieces| - 1];
		return has_valid_octets(parts);
		}

	return F;
	}
## Extracts all IP (v4 or v6) address strings from a given string. ## Extracts all IP (v4 or v6) address strings from a given string.
## ##
## input: a string that may contain an IP address anywhere within it. ## input: a string that may contain an IP address anywhere within it.

View file

@ -101,38 +101,44 @@ void IPAddr::ReverseMask(int top_bits_to_chop)
p[i] &= mask_bits[i]; p[i] &= mask_bits[i];
} }
void IPAddr::Init(const std::string& s) bool IPAddr::ConvertString(const char* s, in6_addr* result)
{ {
if ( s.find(':') == std::string::npos ) // IPv4. for ( auto p = s; *p; ++p )
if ( *p == ':' )
// IPv6
return (inet_pton(AF_INET6, s, result->s6_addr) == 1);
// IPv4
// Parse the address directly instead of using inet_pton since
// some platforms have more sensitive implementations than others
// that can't e.g. handle leading zeroes.
int a[4];
int n = 0;
int match_count = sscanf(s, "%d.%d.%d.%d%n", a+0, a+1, a+2, a+3, &n);
if ( match_count != 4 )
return false;
if ( s[n] != '\0' )
return false;
for ( auto i = 0; i < 4; ++i )
if ( a[i] < 0 || a[i] > 255 )
return false;
uint32_t addr = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3];
addr = htonl(addr);
memcpy(result->s6_addr, v4_mapped_prefix, sizeof(v4_mapped_prefix));
memcpy(&result->s6_addr[12], &addr, sizeof(uint32_t));
return true;
}
void IPAddr::Init(const char* s)
{
if ( ! ConvertString(s, &in6) )
{ {
memcpy(in6.s6_addr, v4_mapped_prefix, sizeof(v4_mapped_prefix)); reporter->Error("Bad IP address: %s", s);
memset(in6.s6_addr, 0, sizeof(in6.s6_addr));
// Parse the address directly instead of using inet_pton since
// some platforms have more sensitive implementations than others
// that can't e.g. handle leading zeroes.
int a[4];
int n = sscanf(s.c_str(), "%d.%d.%d.%d", a+0, a+1, a+2, a+3);
if ( n != 4 || a[0] < 0 || a[1] < 0 || a[2] < 0 || a[3] < 0 ||
a[0] > 255 || a[1] > 255 || a[2] > 255 || a[3] > 255 )
{
reporter->Error("Bad IP address: %s", s.c_str());
memset(in6.s6_addr, 0, sizeof(in6.s6_addr));
return;
}
uint32_t addr = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3];
addr = htonl(addr);
memcpy(&in6.s6_addr[12], &addr, sizeof(uint32_t));
}
else
{
if ( inet_pton(AF_INET6, s.c_str(), in6.s6_addr) <=0 )
{
reporter->Error("Bad IP address: %s", s.c_str());
memset(in6.s6_addr, 0, sizeof(in6.s6_addr));
}
} }
} }

View file

@ -68,7 +68,7 @@ public:
*/ */
IPAddr(const std::string& s) IPAddr(const std::string& s)
{ {
Init(s); Init(s.data());
} }
/** /**
@ -366,6 +366,29 @@ public:
unsigned int MemoryAllocation() const { return padded_sizeof(*this); } unsigned int MemoryAllocation() const { return padded_sizeof(*this); }
/**
 * Converts an IPv4 or IPv6 string into a network address structure
 * (IPv6 or v4-to-v6-mapping in network byte order).
 *
 * A string containing a ':' is parsed as IPv6; anything else is parsed
 * as a dotted-decimal IPv4 address.
 *
 * @param s the IPv4 or IPv6 string to convert (ASCII, NUL-terminated).
 *
 * @param result buffer that the caller supplies to store the result.
 *
 * @return whether the conversion was successful.
 */
static bool ConvertString(const char* s, in6_addr* result);
/**
 * Checks whether a string can be parsed as an IP address.
 *
 * @param s the IPv4 or IPv6 string to check (ASCII, NUL-terminated).
 *
 * @return whether the string is a valid IP address
 */
static bool IsValid(const char* s)
	{
	// The parsed address itself is discarded; only success matters.
	in6_addr scratch;
	const bool parsed_ok = ConvertString(s, &scratch);
	return parsed_ok;
	}
private: private:
friend class IPPrefix; friend class IPPrefix;
@ -373,9 +396,9 @@ private:
* Initializes an address instance from a string representation. * Initializes an address instance from a string representation.
* *
* @param s String containing an IP address as either a dotted IPv4 * @param s String containing an IP address as either a dotted IPv4
* address or a hex IPv6 address. * address or a hex IPv6 address (ASCII, NUL-terminated).
*/ */
void Init(const std::string& s); void Init(const char* s);
in6_addr in6; // IPv6 or v4-to-v6-mapped address in6_addr in6; // IPv6 or v4-to-v6-mapped address

View file

@ -2409,6 +2409,19 @@ function to_addr%(ip: string%): addr
return ret; return ret;
%} %}
## Checks if a string is a valid IPv4 or IPv6 address.
##
## ip: the string to check for valid IP formatting.
##
## Returns: T if the string is a valid IPv4 or IPv6 address format.
function is_valid_ip%(ip: string%): bool
	%{
	// The rendered C string is owned by this function and is released
	// with delete[] as soon as validation is done.
	char* rendered = ip->AsString()->Render();
	const bool valid = IPAddr::IsValid(rendered);
	delete [] rendered;
	return val_mgr->GetBool(valid);
	%}
## Converts a :bro:type:`string` to a :bro:type:`subnet`. ## Converts a :bro:type:`string` to a :bro:type:`subnet`.
## ##
## sn: The subnet to convert. ## sn: The subnet to convert.

View file

@ -53,11 +53,13 @@
#include "iosource/Manager.h" #include "iosource/Manager.h"
/** /**
* Return IP address without enclosing brackets and any leading 0x. * Return IP address without enclosing brackets and any leading 0x. Also
* trims leading/trailing whitespace.
*/ */
std::string extract_ip(const std::string& i) std::string extract_ip(const std::string& i)
{ {
std::string s(skip_whitespace(i.c_str())); std::string s(strstrip(i));
if ( s.size() > 0 && s[0] == '[' ) if ( s.size() > 0 && s[0] == '[' )
s.erase(0, 1); s.erase(0, 1);

View file

@ -1,4 +1,4 @@
============ test ipv4 regex ============ test ipv4 regex (good strings)
T T
T T
T T
@ -6,9 +6,24 @@ T
T T
T T
T T
T
T
T
T
T
T
T
============ bad ipv4 decimals
F F
F F
F F
F
F
F
============ too many ipv4 decimals
F
F
============ typical looking ipv4
T T
T T
============ test ipv6 regex ============ test ipv6 regex
@ -30,6 +45,9 @@ T
F F
F F
F F
F
F
============ test extract_ip_addresses() ============ test extract_ip_addresses()
[1.1.1.1, 2.2.2.2, 3.3.3.3] [1.1.1.1, 2.2.2.2, 3.3.3.3]
[1.1.1.1, 0:0:0:0:0:0:0:0, 3.3.3.3] [1.1.1.1, 0:0:0:0:0:0:0:0, 3.3.3.3]
[6:1:2::3:4:5:6]

View file

@ -5,23 +5,54 @@
event bro_init() event bro_init()
{ {
print "============ test ipv4 regex (good strings)";
local ip = "0.0.0.0"; local ip = "0.0.0.0";
print "============ test ipv4 regex";
print ip == ipv4_addr_regex; print ip == ipv4_addr_regex;
print is_valid_ip(ip); print is_valid_ip(ip);
ip = "1.1.1.1"; ip = "1.1.1.1";
print ip == ipv4_addr_regex; print ip == ipv4_addr_regex;
print is_valid_ip(ip); print is_valid_ip(ip);
ip = "9.9.9.9";
print ip == ipv4_addr_regex;
print is_valid_ip(ip);
ip = "99.99.99.99";
print ip == ipv4_addr_regex;
print is_valid_ip(ip);
ip = "09.99.99.99";
print ip == ipv4_addr_regex;
print is_valid_ip(ip);
ip = "009.99.99.99";
print ip == ipv4_addr_regex;
print is_valid_ip(ip);
ip = "255.255.255.255"; ip = "255.255.255.255";
print ip == ipv4_addr_regex; print ip == ipv4_addr_regex;
print is_valid_ip(ip); print is_valid_ip(ip);
print "============ bad ipv4 decimals";
ip = "255.255.255.256"; ip = "255.255.255.256";
print ip == ipv4_addr_regex; # the regex doesn't check for 0-255 print ip == ipv4_addr_regex;
print is_valid_ip(ip); # but is_valid_ip() will print is_valid_ip(ip);
ip = "255.255.255.295";
print ip == ipv4_addr_regex;
print is_valid_ip(ip);
ip = "255.255.255.300";
print ip == ipv4_addr_regex;
print is_valid_ip(ip);
print "============ too many ipv4 decimals";
ip = "255.255.255.255.255"; ip = "255.255.255.255.255";
print ip == ipv4_addr_regex; print ip == ipv4_addr_regex;
print is_valid_ip(ip); print is_valid_ip(ip);
print "============ typical looking ipv4";
ip = "192.168.1.100"; ip = "192.168.1.100";
print ip == ipv4_addr_regex; print ip == ipv4_addr_regex;
print is_valid_ip(ip); print is_valid_ip(ip);
@ -97,8 +128,16 @@ event bro_init()
ip = "2001:db8:0:0:0:FFFF:192.168.0.256"; ip = "2001:db8:0:0:0:FFFF:192.168.0.256";
print is_valid_ip(ip); print is_valid_ip(ip);
# These have too many hextets ("::" must expand to at least one hextet)
print is_valid_ip("6:1:2::3:4:5:6:7");
print is_valid_ip("6:1:2::3:4:5:6:7:8");
print "============ test extract_ip_addresses()"; print "============ test extract_ip_addresses()";
print extract_ip_addresses("this is 1.1.1.1 a test 2.2.2.2 string with ip addresses 3.3.3.3"); print extract_ip_addresses("this is 1.1.1.1 a test 2.2.2.2 string with ip addresses 3.3.3.3");
print extract_ip_addresses("this is 1.1.1.1 a test 0:0:0:0:0:0:0:0 string with ip addresses 3.3.3.3"); print extract_ip_addresses("this is 1.1.1.1 a test 0:0:0:0:0:0:0:0 string with ip addresses 3.3.3.3");
# This will use the leading 6 from "IPv6" (maybe that's not intended
# by a person trying to parse such a string, but that's just what's going
# to happen; it's on them to deal).
print extract_ip_addresses("IPv6:1:2::3:4:5:6:7");
} }