210 lines
5.7 KiB
Go
210 lines
5.7 KiB
Go
package misspell
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// binary is the set of lower-case file extensions (leading dot included)
// that are treated as binary without looking at the file's contents.
//
// The number of possible binary formats is very large; this table only
// lists items that might be checked into a repo or be an artifact of a
// build. Additions welcome.
//
// Golang's internal table is very small and can't be relied on. Even
// then things like ".js" have a mime type of "application/javascript"
// which isn't very helpful.
//
// "[x]" means we have a sniff test and the suffix test should be eliminated.
var binary = map[string]bool{
	".a":     true, // [ ] archive
	".bin":   true, // [ ] binary
	".bz2":   true, // [ ] compression
	".class": true, // [x] Java class file
	".dll":   true, // [ ] shared library
	".exe":   true, // [ ] binary
	".gif":   true, // [ ] image
	".gpg":   true, // [x] text, but really all base64
	".gz":    true, // [ ] compression
	".ico":   true, // [ ] image
	".jar":   true, // [x] archive
	".jpeg":  true, // [ ] image
	".jpg":   true, // [ ] image
	".mp3":   true, // [ ] audio
	".mp4":   true, // [ ] video
	".mpeg":  true, // [ ] video
	".o":     true, // [ ] object file
	".pdf":   true, // [x] pdf
	".png":   true, // [x] image
	".pyc":   true, // [ ] Python bytecode
	".pyo":   true, // [ ] Python bytecode
	".so":    true, // [x] shared library
	".swp":   true, // [ ] vim swap file
	".tar":   true, // [ ] archive
	".tiff":  true, // [ ] image
	".woff":  true, // [ ] font
	".woff2": true, // [ ] font
	".xz":    true, // [ ] compression
	".z":     true, // [ ] compression
	".zip":   true, // [x] archive
}

// isBinaryFilename returns true if the file is likely to be binary.
//
// Only the final extension of s is examined; it is lower-cased before
// being looked up in the binary table, so "LOGO.PNG" and "logo.png" are
// treated alike. A name without an extension is never reported as binary.
//
// Better heuristics could be done here, in particular a binary
// file is unlikely to be UTF-8 encoded. However this is cheap
// and will solve the immediate need of making sure common
// binary formats are not corrupted by mistake.
func isBinaryFilename(s string) bool {
	return binary[strings.ToLower(filepath.Ext(s))]
}
|
|
|
|
// scm is the set of directory names used by source-control systems for
// their private bookkeeping. Names are matched exactly (case-sensitive).
var scm = map[string]bool{
	".bzr": true,
	".git": true,
	".hg":  true,
	".svn": true,
	"CVS":  true,
}

// isSCMPath returns true if the path is likely part of a (private) SCM
// directory. E.g. .git/something = true
//
// The path is cleaned and split on the OS path separator; the result is
// true as soon as any element (including the last one) is a name listed
// in scm. The one exception is a base name containing "EDITMSG", which
// is always reported as false, wherever the file lives.
func isSCMPath(s string) bool {
	// hack for .git/COMMIT_EDITMSG and .git/TAG_EDITMSG
	// normally we don't look at anything in .git
	// but COMMIT_EDITMSG and TAG_EDITMSG are used as
	// temp files for git commits. Allowing misspell to inspect
	// these files allows for commit-msg hooks
	// https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks
	if strings.Contains(filepath.Base(s), "EDITMSG") {
		return false
	}
	parts := strings.Split(filepath.Clean(s), string(filepath.Separator))
	for _, dir := range parts {
		if scm[dir] {
			return true
		}
	}
	return false
}
|
|
|
|
// magicHeaders lists leading byte sequences that mark content as
// something misspell must not touch, even when the generic content
// sniffer would otherwise classify it as text.
var magicHeaders = [][]byte{
	// Issue #68
	// PGP messages and signatures are "text" but really just
	// blobs of base64-text and should not be misspell-checked
	[]byte("-----BEGIN PGP MESSAGE-----"),
	[]byte("-----BEGIN PGP SIGNATURE-----"),

	// ELF
	{0x7f, 0x45, 0x4c, 0x46},

	// Postscript
	{0x25, 0x21, 0x50, 0x53},

	// PDF
	{0x25, 0x50, 0x44, 0x46},

	// Java class file
	// https://en.wikipedia.org/wiki/Java_class_file
	{0xCA, 0xFE, 0xBA, 0xBE},

	// PNG
	// https://en.wikipedia.org/wiki/Portable_Network_Graphics
	{0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a},

	// ZIP, JAR, ODF, OOXML
	{0x50, 0x4B, 0x03, 0x04},
	{0x50, 0x4B, 0x05, 0x06},
	{0x50, 0x4B, 0x07, 0x08},
}

// isTextFile reports whether raw looks like UTF-8 text that is safe to
// spell-check.
//
// It returns false when raw begins with any entry of magicHeaders.
// Otherwise the decision is delegated to http.DetectContentType: the
// content is accepted only if the detected MIME type starts with "text/"
// and ends with "charset=utf-8".
func isTextFile(raw []byte) bool {
	for _, magic := range magicHeaders {
		if bytes.HasPrefix(raw, magic) {
			return false
		}
	}

	// allow any text/ type with utf-8 encoding
	// DetectContentType sometimes returns charset=utf-16 for XML stuff
	// in which case ignore.
	mime := http.DetectContentType(raw)
	return strings.HasPrefix(mime, "text/") && strings.HasSuffix(mime, "charset=utf-8")
}
|
|
|
|
// ReadTextFile returns the contents of a file, first testing if it is a text file
|
|
// returns ("", nil) if not a text file
|
|
// returns ("", error) if error
|
|
// returns (string, nil) if text
|
|
//
|
|
// unfortunately, in worse case, this does
|
|
// 1 stat
|
|
// 1 open,read,close of 512 bytes
|
|
// 1 more stat,open, read everything, close (via ioutil.ReadAll)
|
|
// This could be kinder to the filesystem.
|
|
//
|
|
// This uses some heuristics of the file's extension (e.g. .zip, .txt) and
|
|
// uses a sniffer to determine if the file is text or not.
|
|
// Using file extensions isn't great, but probably
|
|
// good enough for real-world use.
|
|
// Golang's built in sniffer is problematic for differnet reasons. It's
|
|
// optimized for HTML, and is very limited in detection. It would be good
|
|
// to explicitly add some tests for ELF/DWARF formats to make sure we never
|
|
// corrupt binary files.
|
|
func ReadTextFile(filename string) (string, error) {
|
|
if isBinaryFilename(filename) {
|
|
return "", nil
|
|
}
|
|
|
|
if isSCMPath(filename) {
|
|
return "", nil
|
|
}
|
|
|
|
fstat, err := os.Stat(filename)
|
|
|
|
if err != nil {
|
|
return "", fmt.Errorf("Unable to stat %q: %s", filename, err)
|
|
}
|
|
|
|
// directory: nothing to do.
|
|
if fstat.IsDir() {
|
|
return "", nil
|
|
}
|
|
|
|
// avoid reading in multi-gig files
|
|
// if input is large, read the first 512 bytes to sniff type
|
|
// if not-text, then exit
|
|
isText := false
|
|
if fstat.Size() > 50000 {
|
|
fin, err := os.Open(filename)
|
|
if err != nil {
|
|
return "", fmt.Errorf("Unable to open large file %q: %s", filename, err)
|
|
}
|
|
defer fin.Close()
|
|
buf := make([]byte, 512)
|
|
_, err = io.ReadFull(fin, buf)
|
|
if err != nil {
|
|
return "", fmt.Errorf("Unable to read 512 bytes from %q: %s", filename, err)
|
|
}
|
|
if !isTextFile(buf) {
|
|
return "", nil
|
|
}
|
|
|
|
// set so we don't double check this file
|
|
isText = true
|
|
}
|
|
|
|
// read in whole file
|
|
raw, err := ioutil.ReadFile(filename)
|
|
if err != nil {
|
|
return "", fmt.Errorf("Unable to read all %q: %s", filename, err)
|
|
}
|
|
|
|
if !isText && !isTextFile(raw) {
|
|
return "", nil
|
|
}
|
|
return string(raw), nil
|
|
}
|