Parcourir la source

Add support for gzip-compressed raw databases

Gwyneth Llewelyn il y a 2 ans
Parent
commit
9e2b5d7fd7
3 fichiers modifiés avec 31 ajouts et 5 suppressions
  1. 1 0
      go.mod
  2. 2 0
      go.sum
  3. 28 5
      gosl.go

+ 1 - 0
go.mod

@@ -5,6 +5,7 @@ go 1.17
 require (
 	github.com/dgraph-io/badger v1.6.2
 	github.com/dgraph-io/badger/v3 v3.2011.1
+	github.com/h2non/filetype v1.1.1
 	github.com/op/go-logging v0.0.0-20160315200505-970db520ece7
 	github.com/spf13/pflag v1.0.5
 	github.com/spf13/viper v1.7.1

+ 2 - 0
go.sum

@@ -119,6 +119,8 @@ github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad
 github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs=
 github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
 github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY=
+github.com/h2non/filetype v1.1.1 h1:xvOwnXKAckvtLWsN398qS9QhlxlnVXBjXBydK2/UFB4=
+github.com/h2non/filetype v1.1.1/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY=
 github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q=
 github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8=
 github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=

+ 28 - 5
gosl.go

@@ -4,6 +4,7 @@ package main
 import (
 	"bufio"
 	"compress/bzip2"
+	"compress/gzip"
 	"encoding/csv"
 	"encoding/json"
 	"fmt"
@@ -21,6 +22,7 @@ import (
 //	"github.com/dgraph-io/badger/options"
 //	"github.com/fsnotify/fsnotify"
 	"github.com/h2non/filetype"
+	"github.com/h2non/filetype/matchers"
 	"github.com/op/go-logging"
 	flag "github.com/spf13/pflag"
 	"github.com/spf13/viper"
@@ -558,11 +560,32 @@ func importDatabase(filename string) {
 	defer filehandler.Close()
 
 	// First, check if we _do_ have a gzipped file or not...
-
-
-
-	gr := bzip2.NewReader(filehandler) // open bzip2 reader
-	cr := csv.NewReader(gr)  // open csv reader and feed the bzip2 reader into it
+	// We'll use a small library for that (gwyneth 20211027)
+
+	// We only have to pass the file header = first 261 bytes
+	head := make([]byte, 261)
+	_, err = filehandler.Read(head)
+	checkErr(err)
+
+	kind, err := filetype.Match(head)
+	checkErr(err)
+
+	var cr *csv.Reader	// CSV reader needs to be declared here because of scope issues. (gwyneth 20211027)
+
+	// Technically, we could match for a lot of archives and get an io.Reader for each.
+	// However, W-Hat has a limited selection of archives available (currently gzip and bzip2)
+	// so we limit ourselves to these two, falling back to plaintext (gwyneth 20211027).
+	switch kind {
+		case matchers.TypeBz2:
+			gr := bzip2.NewReader(filehandler) // open bzip2 reader
+			cr = csv.NewReader(gr)  // open csv reader and feed the bzip2 reader into it
+		case matchers.TypeGz:
+			zr := gzip.NewReader(filehandler) // open gzip reader
+			cr = csv.NewReader(zr)  // open csv reader and feed the gzip reader into it
+		default:
+			// We just assume that it's a CSV (uncompressed) file and open it.
+			cr = csv.NewReader(filehandler)
+	}
 
 	limit := 0	// outside of for loop so that we can count how many entries we had in total
 	time_start := time.Now() // we want to get an idea on how long this takes