maphash.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package maphash provides hash functions on byte sequences and comparable values.
     6  // It also defines [Hasher], the interface between a hash function and a hash table.
     7  //
     8  // These hash functions are intended to be used to implement hash
     9  // tables, Bloom filters, and other data structures that need to map
    10  // arbitrary strings or byte sequences to a uniform distribution on
    11  // unsigned 64-bit integers.
    12  //
    13  // Each different instance of a hash table or data structure should use its own [Seed].
    14  //
    15  // The hash functions are not cryptographically secure.
    16  // (See crypto/sha256 and crypto/sha512 for cryptographic use.)
    17  package maphash
    18  
    19  import (
    20  	"hash"
    21  	"internal/abi"
    22  	"internal/runtime/maps"
    23  	"unsafe"
    24  )
    25  
    26  // A Seed is a random value that selects the specific hash function
    27  // computed by a [Hash]. If two Hashes use the same Seeds, they
    28  // will compute the same hash values for any given input.
    29  // If two Hashes use different Seeds, they are very likely to compute
    30  // distinct hash values for any given input.
    31  //
    32  // A Seed must be initialized by calling [MakeSeed].
    33  // The zero seed is uninitialized and not valid for use with [Hash]'s SetSeed method.
    34  //
    35  // Each Seed value is local to a single process and cannot be serialized
    36  // or otherwise recreated in a different process.
    37  type Seed struct {
    38  	s uint64
    39  }
    40  
    41  // Bytes returns the hash of b with the given seed.
    42  //
    43  // Bytes is equivalent to, but more convenient and efficient than:
    44  //
    45  //	var h Hash
    46  //	h.SetSeed(seed)
    47  //	h.Write(b)
    48  //	return h.Sum64()
    49  func Bytes(seed Seed, b []byte) uint64 {
    50  	state := seed.s
    51  	if state == 0 {
    52  		panic("maphash: use of uninitialized Seed")
    53  	}
    54  
    55  	if len(b) > bufSize {
    56  		b = b[:len(b):len(b)] // merge len and cap calculations when reslicing
    57  		for len(b) > bufSize {
    58  			state = rthash(b[:bufSize], state)
    59  			b = b[bufSize:]
    60  		}
    61  	}
    62  	return rthash(b, state)
    63  }
    64  
    65  // String returns the hash of s with the given seed.
    66  //
    67  // String is equivalent to, but more convenient and efficient than:
    68  //
    69  //	var h Hash
    70  //	h.SetSeed(seed)
    71  //	h.WriteString(s)
    72  //	return h.Sum64()
    73  func String(seed Seed, s string) uint64 {
    74  	state := seed.s
    75  	if state == 0 {
    76  		panic("maphash: use of uninitialized Seed")
    77  	}
    78  	for len(s) > bufSize {
    79  		state = rthashString(s[:bufSize], state)
    80  		s = s[bufSize:]
    81  	}
    82  	return rthashString(s, state)
    83  }
    84  
    85  // A Hash computes a seeded hash of a byte sequence.
    86  //
    87  // The zero Hash is a valid Hash ready to use.
    88  // A zero Hash chooses a random seed for itself during
    89  // the first call to a Reset, Write, Seed, Clone, or Sum64 method.
    90  // For control over the seed, use SetSeed.
    91  //
    92  // The computed hash values depend only on the initial seed and
    93  // the sequence of bytes provided to the Hash object, not on the way
    94  // in which the bytes are provided. For example, the three sequences
    95  //
    96  //	h.Write([]byte{'f','o','o'})
    97  //	h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
    98  //	h.WriteString("foo")
    99  //
   100  // all have the same effect.
   101  //
   102  // Hashes are intended to be collision-resistant, even for situations
   103  // where an adversary controls the byte sequences being hashed.
   104  //
   105  // A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
   106  // If multiple goroutines must compute the same seeded hash,
   107  // each can declare its own Hash and call SetSeed with a common Seed.
   108  type Hash struct {
   109  	_     [0]func()     // not comparable
   110  	seed  Seed          // initial seed used for this hash
   111  	state Seed          // current hash of all flushed bytes
   112  	buf   [bufSize]byte // unflushed byte buffer
   113  	n     int           // number of unflushed bytes
   114  }
   115  
   116  // bufSize is the size of the Hash write buffer.
   117  // The buffer ensures that writes depend only on the sequence of bytes,
   118  // not the sequence of WriteByte/Write/WriteString calls,
   119  // by always calling rthash with a full buffer (except for the tail).
   120  const bufSize = 128
   121  
   122  // initSeed seeds the hash if necessary.
   123  // initSeed is called lazily before any operation that actually uses h.seed/h.state.
   124  // Note that this does not include Write/WriteByte/WriteString in the case
   125  // where they only add to h.buf. (If they write too much, they call h.flush,
   126  // which does call h.initSeed.)
   127  func (h *Hash) initSeed() {
   128  	if h.seed.s == 0 {
   129  		seed := MakeSeed()
   130  		h.seed = seed
   131  		h.state = seed
   132  	}
   133  }
   134  
   135  // WriteByte adds b to the sequence of bytes hashed by h.
   136  // It never fails; the error result is for implementing [io.ByteWriter].
   137  func (h *Hash) WriteByte(b byte) error {
   138  	if h.n == len(h.buf) {
   139  		h.flush()
   140  	}
   141  	h.buf[h.n] = b
   142  	h.n++
   143  	return nil
   144  }
   145  
   146  // Write adds b to the sequence of bytes hashed by h.
   147  // It always writes all of b and never fails; the count and error result are for implementing [io.Writer].
   148  func (h *Hash) Write(b []byte) (int, error) {
   149  	size := len(b)
   150  	// Deal with bytes left over in h.buf.
   151  	// h.n <= bufSize is always true.
   152  	// Checking it is ~free and it lets the compiler eliminate a bounds check.
   153  	if h.n > 0 && h.n <= bufSize {
   154  		k := copy(h.buf[h.n:], b)
   155  		h.n += k
   156  		if h.n < bufSize {
   157  			// Copied the entirety of b to h.buf.
   158  			return size, nil
   159  		}
   160  		b = b[k:]
   161  		h.flush()
   162  		// No need to set h.n = 0 here; it happens just before exit.
   163  	}
   164  	// Process as many full buffers as possible, without copying, and calling initSeed only once.
   165  	if len(b) > bufSize {
   166  		h.initSeed()
   167  		for len(b) > bufSize {
   168  			h.state.s = rthash(b[:bufSize], h.state.s)
   169  			b = b[bufSize:]
   170  		}
   171  	}
   172  	// Copy the tail.
   173  	copy(h.buf[:], b)
   174  	h.n = len(b)
   175  	return size, nil
   176  }
   177  
   178  // WriteString adds the bytes of s to the sequence of bytes hashed by h.
   179  // It always writes all of s and never fails; the count and error result are for implementing [io.StringWriter].
   180  func (h *Hash) WriteString(s string) (int, error) {
   181  	// WriteString mirrors Write. See Write for comments.
   182  	size := len(s)
   183  	if h.n > 0 && h.n <= bufSize {
   184  		k := copy(h.buf[h.n:], s)
   185  		h.n += k
   186  		if h.n < bufSize {
   187  			return size, nil
   188  		}
   189  		s = s[k:]
   190  		h.flush()
   191  	}
   192  	if len(s) > bufSize {
   193  		h.initSeed()
   194  		for len(s) > bufSize {
   195  			h.state.s = rthashString(s[:bufSize], h.state.s)
   196  			s = s[bufSize:]
   197  		}
   198  	}
   199  	copy(h.buf[:], s)
   200  	h.n = len(s)
   201  	return size, nil
   202  }
   203  
   204  // Seed returns h's seed value.
   205  func (h *Hash) Seed() Seed {
   206  	h.initSeed()
   207  	return h.seed
   208  }
   209  
   210  // SetSeed sets h to use seed, which must have been returned by [MakeSeed]
   211  // or by another [Hash.Seed] method.
   212  // Two [Hash] objects with the same seed behave identically.
   213  // Two [Hash] objects with different seeds will very likely behave differently.
   214  // Any bytes added to h before this call will be discarded.
   215  func (h *Hash) SetSeed(seed Seed) {
   216  	if seed.s == 0 {
   217  		panic("maphash: use of uninitialized Seed")
   218  	}
   219  	h.seed = seed
   220  	h.state = seed
   221  	h.n = 0
   222  }
   223  
   224  // Reset discards all bytes added to h.
   225  // (The seed remains the same.)
   226  func (h *Hash) Reset() {
   227  	h.initSeed()
   228  	h.state = h.seed
   229  	h.n = 0
   230  }
   231  
   232  // precondition: buffer is full.
   233  func (h *Hash) flush() {
   234  	if h.n != len(h.buf) {
   235  		panic("maphash: flush of partially full buffer")
   236  	}
   237  	h.initSeed()
   238  	h.state.s = rthash(h.buf[:h.n], h.state.s)
   239  	h.n = 0
   240  }
   241  
   242  // Sum64 returns h's current 64-bit value, which depends on
   243  // h's seed and the sequence of bytes added to h since the
   244  // last call to [Hash.Reset] or [Hash.SetSeed].
   245  //
   246  // All bits of the Sum64 result are close to uniformly and
   247  // independently distributed, so it can be safely reduced
   248  // by using bit masking, shifting, or modular arithmetic.
   249  func (h *Hash) Sum64() uint64 {
   250  	h.initSeed()
   251  	return rthash(h.buf[:h.n], h.state.s)
   252  }
   253  
   254  // MakeSeed returns a new random seed.
   255  func MakeSeed() Seed {
   256  	var s uint64
   257  	for {
   258  		s = randUint64()
   259  		// We use seed 0 to indicate an uninitialized seed/hash,
   260  		// so keep trying until we get a non-zero seed.
   261  		if s != 0 {
   262  			break
   263  		}
   264  	}
   265  	return Seed{s: s}
   266  }
   267  
   268  // Sum appends the hash's current 64-bit value to b.
   269  // It exists for implementing [hash.Hash].
   270  // For direct calls, it is more efficient to use [Hash.Sum64].
   271  func (h *Hash) Sum(b []byte) []byte {
   272  	x := h.Sum64()
   273  	return append(b,
   274  		byte(x>>0),
   275  		byte(x>>8),
   276  		byte(x>>16),
   277  		byte(x>>24),
   278  		byte(x>>32),
   279  		byte(x>>40),
   280  		byte(x>>48),
   281  		byte(x>>56))
   282  }
   283  
   284  // Size returns h's hash value size, 8 bytes.
   285  func (h *Hash) Size() int { return 8 }
   286  
   287  // BlockSize returns h's block size.
   288  func (h *Hash) BlockSize() int { return len(h.buf) }
   289  
   290  // Clone implements [hash.Cloner].
   291  func (h *Hash) Clone() (hash.Cloner, error) {
   292  	h.initSeed()
   293  	r := *h
   294  	return &r, nil
   295  }
   296  
   297  // Comparable returns the hash of comparable value v with the given seed
   298  // such that Comparable(s, v1) == Comparable(s, v2) if v1 == v2.
   299  // If v != v, then the resulting hash is randomly distributed.
   300  func Comparable[T comparable](seed Seed, v T) uint64 {
   301  	abi.EscapeNonString(v)
   302  	return comparableHash(v, seed)
   303  }
   304  
   305  // WriteComparable adds x to the data hashed by h.
   306  func WriteComparable[T comparable](h *Hash, x T) {
   307  	abi.EscapeNonString(x)
   308  	// writeComparable directly operates on h.state
   309  	// without using h.buf. Mix in the buffer length so it won't
   310  	// commute with a buffered write, which either changes h.n or changes
   311  	// h.state.
   312  	if h.n != 0 {
   313  		writeComparable(h, h.n)
   314  	}
   315  	writeComparable(h, x)
   316  }
   317  
   318  //go:linkname runtime_rand runtime.rand
   319  func runtime_rand() uint64
   320  
   321  //go:linkname runtime_memhash runtime.memhash
   322  //go:noescape
   323  func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
   324  
   325  func rthash(buf []byte, seed uint64) uint64 {
   326  	if len(buf) == 0 {
   327  		return seed
   328  	}
   329  	len := len(buf)
   330  	// The runtime hasher only works on uintptr. For 64-bit
   331  	// architectures, we use the hasher directly. Otherwise,
   332  	// we use two parallel hashers on the lower and upper 32 bits.
   333  	if maps.Use64BitHash {
   334  		return uint64(runtime_memhash(unsafe.Pointer(&buf[0]), uintptr(seed), uintptr(len)))
   335  	}
   336  	lo := runtime_memhash(unsafe.Pointer(&buf[0]), uintptr(uint32(seed)), uintptr(len))
   337  	hi := runtime_memhash(unsafe.Pointer(&buf[0]), uintptr(seed>>32), uintptr(len))
   338  	return uint64(hi)<<32 | uint64(lo)
   339  }
   340  
   341  func rthashString(s string, state uint64) uint64 {
   342  	buf := unsafe.Slice(unsafe.StringData(s), len(s))
   343  	return rthash(buf, state)
   344  }
   345  
   346  func randUint64() uint64 {
   347  	return runtime_rand()
   348  }
   349  
   350  func comparableHash[T comparable](v T, seed Seed) uint64 {
   351  	s := seed.s
   352  	var m map[T]struct{}
   353  	mTyp := abi.TypeOf(m)
   354  	hasher := (*abi.MapType)(unsafe.Pointer(mTyp)).Hasher
   355  	if maps.Use64BitHash {
   356  		return uint64(hasher(abi.NoEscape(unsafe.Pointer(&v)), uintptr(s)))
   357  	}
   358  	lo := hasher(abi.NoEscape(unsafe.Pointer(&v)), uintptr(uint32(s)))
   359  	hi := hasher(abi.NoEscape(unsafe.Pointer(&v)), uintptr(s>>32))
   360  	return uint64(hi)<<32 | uint64(lo)
   361  }
   362  
   363  func writeComparable[T comparable](h *Hash, v T) {
   364  	h.state.s = comparableHash(v, h.state)
   365  }
   366
View as plain text