Commit
Add katana crawler selection
yhy0 committed Mar 17, 2024
1 parent e36b830 commit 84cc10b
Showing 4 changed files with 147 additions and 51 deletions.
File renamed without changes.
94 changes: 43 additions & 51 deletions crawler/crawlergo/filter/smart_filter.go
@@ -10,7 +10,7 @@ import (
"sort"
"strings"
"sync"

mapset "github.com/deckarep/golang-set/v2"
)

@@ -95,9 +95,9 @@ func (s *SmartFilter) DoFilter(req *model.Request) bool {
// logging.Logger.Debugf("filter req by simplefilter: " + req.URL.RequestURI())
return true
}

req.Filter.FragmentID = s.calcFragmentID(req.URL.Fragment)

// Mark the request
if req.Method == config.GET || req.Method == config.DELETE || req.Method == config.HEAD || req.Method == config.OPTIONS {
s.getMark(req)
@@ -107,22 +107,22 @@ func (s *SmartFilter) DoFilter(req *model.Request) bool {
} else {
// logging.Logger.Debug("dont support such method: " + req.Method)
}

// Deduplicate the marked request
uniqueId := req.Filter.UniqueId
if s.uniqueMarkedIds.Contains(uniqueId) {
// logging.Logger.Debugf("filter req by uniqueMarkedIds 1: " + req.URL.RequestURI())
return true
}

// Global numeric parameter marking
s.globalFilterLocationMark(req)

// Next, deduplicate the marked GET requests
if req.Method == config.GET || req.Method == config.DELETE || req.Method == config.HEAD || req.Method == config.OPTIONS {
// Mark GET requests that exceed the threshold
s.overCountMark(req)

// Recalculate QueryMapId
req.Filter.QueryMapId = getParamMapID(req.Filter.MarkedQueryMap)
// Recalculate PathId
@@ -131,17 +131,17 @@ func (s *SmartFilter) DoFilter(req *model.Request) bool {
// Recalculate PostDataId
req.Filter.PostDataId = getParamMapID(req.Filter.MarkedPostDataMap)
}

// Recalculate the request's unique ID
req.Filter.UniqueId = getMarkedUniqueID(req)

// Deduplicate again with the new ID
newUniqueId := req.Filter.UniqueId
if s.uniqueMarkedIds.Contains(newUniqueId) {
// logging.Logger.Debugf("filter req by uniqueMarkedIds 2: " + req.URL.RequestURI())
return true
}

// Add to the result set
s.uniqueMarkedIds.Add(newUniqueId)
return false
@@ -170,13 +170,13 @@ func (s *SmartFilter) getMark(req *model.Request) {
// First, pre-replace before decoding
todoURL := *(req.URL)
todoURL.RawQuery = s.preQueryMark(todoURL.RawQuery)

// Apply the marks in order
queryMap := todoURL.QueryMap()
queryMap = markParamName(queryMap)
queryMap = s.markParamValue(queryMap, *req)
markedPath := MarkPath(todoURL.Path)

// Compute the unique IDs
var queryKeyID string
var queryMapID string
@@ -188,13 +188,13 @@ func (s *SmartFilter) getMark(req *model.Request) {
queryMapID = ""
}
pathID := getPathID(markedPath)

req.Filter.MarkedQueryMap = queryMap
req.Filter.QueryKeysId = queryKeyID
req.Filter.QueryMapId = queryMapID
req.Filter.MarkedPath = markedPath
req.Filter.PathId = pathID

// Finally, compute the unique ID of the marked request
req.Filter.UniqueId = getMarkedUniqueID(req)
}
@@ -205,11 +205,11 @@ func (s *SmartFilter) getMark(req *model.Request) {
*/
func (s *SmartFilter) postMark(req *model.Request) {
postDataMap := req.PostDataMap()

postDataMap = markParamName(postDataMap)
postDataMap = s.markParamValue(postDataMap, *req)
markedPath := MarkPath(req.URL.Path)

// Compute the unique ID
var postDataMapID string
if len(postDataMap) != 0 {
@@ -218,12 +218,12 @@ func (s *SmartFilter) postMark(req *model.Request) {
postDataMapID = ""
}
pathID := getPathID(markedPath)

req.Filter.MarkedPostDataMap = postDataMap
req.Filter.PostDataId = postDataMapID
req.Filter.MarkedPath = markedPath
req.Filter.PathId = pathID

// Finally, compute the unique ID of the marked request
req.Filter.UniqueId = getMarkedUniqueID(req)
}
@@ -415,44 +415,41 @@ func (s *SmartFilter) repeatCountStatistic(req *model.Request) {
} else {
s.filterParamKeyRepeatCount.Store(queryKeyId, 1)
}

for key, value := range req.Filter.MarkedQueryMap {
// Count repeated parameter names for a given URL
paramQueryKey := queryKeyId + key

if set, ok := s.filterParamKeySingleValues.Load(paramQueryKey); ok {
-			set := set.(mapset.Set[interface{}])
-			set.Add(value)
+			set.(mapset.Set[interface{}]).Add(value)
} else {
s.filterParamKeySingleValues.Store(paramQueryKey, mapset.NewSet(value))
}

// Count repeats of each parameter across all URLs in this round
if _, ok := s.filterParamKeyAllValues.Load(key); !ok {
s.filterParamKeyAllValues.Store(key, mapset.NewSet(value))
} else {
if v, ok := s.filterParamKeyAllValues.Load(key); ok {
-			set := v.(mapset.Set[interface{}])
-			if !set.Contains(value) {
-				set.Add(value)
+			if !v.(mapset.Set[interface{}]).Contains(value) {
+				v.(mapset.Set[interface{}]).Add(value)
}
}
}

// If the parameter value is empty, count the empty-valued parameter names under this PATH
if value == "" {
if _, ok := s.filterPathParamEmptyValues.Load(pathId); !ok {
s.filterPathParamEmptyValues.Store(pathId, mapset.NewSet(key))
} else {
if v, ok := s.filterPathParamEmptyValues.Load(pathId); ok {
-				set := v.(mapset.Set[string])
-				if !set.Contains(key) {
-					set.Add(key)
+				if !v.(mapset.Set[string]).Contains(key) {
+					v.(mapset.Set[string]).Add(key)
}
}
}
}

pathIdKey := pathId + key
// Count occurrences of the dedup value mark for parameters under this path
if v, ok := s.filterPathParamKeySymbol.Load(pathIdKey); ok {
@@ -462,25 +459,24 @@ func (s *SmartFilter) repeatCountStatistic(req *model.Request) {
} else {
s.filterPathParamKeySymbol.Store(pathIdKey, 1)
}

}
}

// Count this level's path entries relative to the parent directory; when a file extension is present, let common script extensions through
if req.URL.ParentPath() == "" || inCommonScriptSuffix(req.URL.FileExt()) {
return
}

//
parentPathId := tools.StrMd5(req.URL.ParentPath())
currentPath := strings.Replace(req.Filter.MarkedPath, req.URL.ParentPath(), "", -1)
if _, ok := s.filterParentPathValues.Load(parentPathId); !ok {
s.filterParentPathValues.Store(parentPathId, mapset.NewSet(currentPath))
} else {
if v, ok := s.filterParentPathValues.Load(parentPathId); ok {
-		set := v.(mapset.Set[string])
-		if !set.Contains(currentPath) {
-			set.Add(currentPath)
+		if !v.(mapset.Set[string]).Contains(currentPath) {
+			v.(mapset.Set[string]).Add(currentPath)
}
}
}
@@ -500,34 +496,31 @@ func (s *SmartFilter) overCountMark(req *model.Request) {
for key := range req.Filter.MarkedQueryMap {
paramQueryKey := queryKeyId + key
if set, ok := s.filterParamKeySingleValues.Load(paramQueryKey); ok {
-			set := set.(mapset.Set[string])
-			if set.Cardinality() > 3 {
+			if set.(mapset.Set[interface{}]).Cardinality() > 3 {
req.Filter.MarkedQueryMap[key] = FixParamRepeatMark
}
}
}
}

for key := range req.Filter.MarkedQueryMap {
// Across all URLs, mark a parameter for dedup when its distinct values exceed the threshold
if paramKeySet, ok := s.filterParamKeyAllValues.Load(key); ok {
-		_paramKeySet := paramKeySet.(mapset.Set[interface{}])
-		if _paramKeySet.Cardinality() > MaxParamKeyAllCount {
+		if paramKeySet.(mapset.Set[interface{}]).Cardinality() > MaxParamKeyAllCount {
req.Filter.MarkedQueryMap[key] = FixParamRepeatMark
}
}

pathIdKey := pathId + key
// If the dedup value mark for one of a PATH's GET parameters appears more often than the threshold, mark that parameter globally for the PATH
if v, ok := s.filterPathParamKeySymbol.Load(pathIdKey); ok && v.(int) > MaxPathParamKeySymbolCount {
req.Filter.MarkedQueryMap[key] = FixParamRepeatMark
}
}

// Handle paths where the number of empty-valued parameters exceeds the threshold, e.g. pseudo-static: http://bang.360.cn/?chu_xiu
if v, ok := s.filterPathParamEmptyValues.Load(pathId); ok {
-		set := v.(mapset.Set[string])
-		if set.Cardinality() > MaxPathParamEmptyCount {
+		if v.(mapset.Set[string]).Cardinality() > MaxPathParamEmptyCount {
newMarkerQueryMap := map[string]interface{}{}
for key, value := range req.Filter.MarkedQueryMap {
if value == "" {
Expand All @@ -540,15 +533,14 @@ func (s *SmartFilter) overCountMark(req *model.Request) {
}
}
}

// Handle pseudo-static paths at this level
if req.URL.ParentPath() == "" || inCommonScriptSuffix(req.URL.FileExt()) {
return
}
parentPathId := tools.StrMd5(req.URL.ParentPath())
if set, ok := s.filterParentPathValues.Load(parentPathId); ok {
-		set := set.(mapset.Set[string])
-		if set.Cardinality() > MaxParentPathCount {
+		if set.(mapset.Set[string]).Cardinality() > MaxParentPathCount {
if strings.HasSuffix(req.URL.ParentPath(), "/") {
req.Filter.MarkedPath = req.URL.ParentPath() + FixPathMark
} else {
@@ -586,15 +578,15 @@ func getMarkedUniqueID(req *model.Request) string {
} else {
paramId = req.Filter.PostDataId
}

uniqueStr := req.Method + paramId + req.Filter.PathId + req.URL.Host + req.Filter.FragmentID
if req.RedirectionFlag {
uniqueStr += "Redirection"
}
if req.URL.Path == "/" && req.URL.RawQuery == "" && req.URL.Scheme == "https" {
uniqueStr += "https"
}

return tools.StrMd5(uniqueStr)
}

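The recurring change in smart_filter.go above drops the intermediate set variable (set := v.(mapset.Set[...])) and asserts the concrete set type inline at each use, since sync.Map only hands back interface{}. A minimal self-contained sketch of that pattern, assuming just sync.Map and golang-set/v2 (key and value names are illustrative):

package main

import (
	"fmt"
	"sync"

	mapset "github.com/deckarep/golang-set/v2"
)

func main() {
	var values sync.Map // key -> mapset.Set[interface{}]

	add := func(key string, value interface{}) {
		if v, ok := values.Load(key); ok {
			// sync.Map stores interface{}, so the concrete set type
			// must be asserted back before calling Add.
			v.(mapset.Set[interface{}]).Add(value)
		} else {
			values.Store(key, mapset.NewSet(value))
		}
	}

	add("id", "1")
	add("id", "2")
	add("id", "2") // duplicate, absorbed by the set

	if v, ok := values.Load("id"); ok {
		fmt.Println(v.(mapset.Set[interface{}]).Cardinality()) // 2
	}
}

As in the original, the Load/Store pair is not atomic; sync.Map's LoadOrStore would close that small race if it ever mattered.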
78 changes: 78 additions & 0 deletions crawler/katana.go
@@ -0,0 +1,78 @@
package crawler

import (
"github.com/projectdiscovery/katana/pkg/engine"
"github.com/projectdiscovery/katana/pkg/engine/hybrid"
"github.com/projectdiscovery/katana/pkg/engine/standard"
"github.com/projectdiscovery/katana/pkg/output"
"github.com/projectdiscovery/katana/pkg/types"
"github.com/yhy0/Jie/conf"
"github.com/yhy0/logging"
"math"
)

/**
@author: yhy
@since: 2024/3/17
@desc: //TODO
**/

// File extensions filtered out by default
var extensionFilter = []string{
".css", ".png", ".gif", ".jpg", ".mp4", ".mp3", ".mng", ".pct", ".bmp", ".jpeg", ".pst", ".psp", ".ttf",
".tif", ".tiff", ".ai", ".drw", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".au", ".aiff",
".dxf", ".eps", ".ps", ".svg", ".3gp", ".asf", ".asx", ".avi", ".mov", ".mpg", ".qt", ".rm",
".wmv", ".m4a", ".bin", ".xls", ".xlsx", ".ppt", ".pptx", ".doc", ".docx", ".odt", ".ods", ".odg",
".odp", ".exe", ".zip", ".rar", ".tar", ".gz", ".iso", ".rss", ".pdf", ".dll", ".ico",
".gz2", ".apk", ".crt", ".woff", ".map", ".woff2", ".webp", ".less", ".dmg", ".bz2", ".otf", ".swf",
".flv", ".mpeg", ".dat", ".xsl", ".csv", ".cab", ".exif", ".wps", ".m4v", ".rmvb",
}

func Katana(target string, headless bool, show bool, out func(result output.Result)) {
// TODO: as a library this still has bugs; some of the options below have no effect at all, so handle those cases ourselves for now
options := &types.Options{
MaxDepth: 3, // Maximum depth to crawl
FieldScope: "fqdn", // rdn: root domain and all subdomains (default); dn: domain-keyword scope; fqdn: only the given (sub)domain
BodyReadSize: math.MaxInt, // Maximum response size to read
Timeout: 10, // Timeout is the time to wait for request in seconds
Concurrency: 10, // Concurrency is the number of concurrent crawling goroutines
Parallelism: 10, // Parallelism is the number of urls processing goroutines
Delay: 0, // Delay is the delay between each crawl requests in seconds
RateLimit: 150, // Maximum requests to send per second
Strategy: "depth-first", // Visit strategy (depth-first, breadth-first)
OnResult: out,
Headless: headless,
Proxy: conf.GlobalConfig.Http.Proxy,
ExtensionFilter: extensionFilter,
}
if options.Headless {
options.ShowBrowser = show
options.UseInstalledChrome = false
}

crawlerOptions, err := types.NewCrawlerOptions(options)
if err != nil {
logging.Logger.Fatal(err.Error())
}
defer crawlerOptions.Close()

var crawler engine.Engine

switch {
case options.Headless:
crawler, err = hybrid.New(crawlerOptions)
default:
crawler, err = standard.New(crawlerOptions)
}

if err != nil {
logging.Logger.Fatal("could not create standard crawler", err.Error())
}

defer crawler.Close()

err = crawler.Crawl(target)
if err != nil {
logging.Logger.Warnf("Could not crawl %s: %s", target, err.Error())
}
}
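Results leave Katana only through the OnResult callback, which may be invoked concurrently by the crawling goroutines. A sketch of how a caller might collect a deduplicated URL list — collectURLs is a hypothetical helper, not part of this package; the logging setup mirrors the test below, and conf.GlobalConfig is assumed to be initialized since Katana reads the proxy from it:

package main

import (
	"fmt"
	"sync"

	"github.com/projectdiscovery/katana/pkg/output"
	"github.com/yhy0/Jie/crawler"
	"github.com/yhy0/logging"
)

// collectURLs (hypothetical) returns an OnResult callback that records
// each crawled URL once, plus a snapshot function; the mutex guards
// against callbacks arriving from multiple goroutines.
func collectURLs() (func(result output.Result), func() []string) {
	var mu sync.Mutex
	seen := make(map[string]bool)
	var urls []string

	onResult := func(result output.Result) {
		mu.Lock()
		defer mu.Unlock()
		if !seen[result.Request.URL] {
			seen[result.Request.URL] = true
			urls = append(urls, result.Request.URL)
		}
	}
	snapshot := func() []string {
		mu.Lock()
		defer mu.Unlock()
		return append([]string(nil), urls...)
	}
	return onResult, snapshot
}

func main() {
	logging.Logger = logging.New(false, "", "1", true)

	onResult, snapshot := collectURLs()
	crawler.Katana("https://example.com", false, false, onResult)

	for _, u := range snapshot() {
		fmt.Println(u)
	}
}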
26 changes: 26 additions & 0 deletions crawler/katana_test.go
@@ -0,0 +1,26 @@
package crawler

import (
"github.com/projectdiscovery/katana/pkg/output"
"github.com/yhy0/logging"
"testing"
)

/**
@author: yhy
@since: 2023/1/31
@desc: //TODO
**/

func TestKatana(t *testing.T) {
logging.Logger = logging.New(false, "", "1", true)

out := func(result output.Result) { // Callback function to execute for result
// if ValidatePath(result.Request.URL) {
// logging.Logger.Infoln(result.Request.URL)
// }
logging.Logger.Infoln(result.Request.URL)
}

Katana("https://www.baidu.com", true, true, out)
}
