Module pywander.crawler.utils

Functions

def check_url_type(url)
def check_url_type(url):
    """
    This only classifies the URL type. HTML files downloaded from the
    network contain various URL types; each needs to be recognized and
    handled with the appropriate strategy.
    """
    p = urlsplit(url)
    if p.scheme and p.netloc and p.path:
        return URLType.Absolute

    if not p.scheme and p.netloc and p.path:
        return URLType.MissScheme

    if not p.scheme and not p.netloc and p.path:
        if p.path.startswith('/'):
            return URLType.RelativeSite
        else:
            return URLType.RelativeFolder
    if not p.scheme and not p.netloc and not p.path:
        if p.fragment:
            return URLType.RelativeArticle
        else:
            return URLType.InValid

This only classifies the URL type. HTML files downloaded from the network contain various URL types; each needs to be recognized and handled with the appropriate strategy.
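
A quick illustration of the classification (the example URLs are placeholders, not taken from the module):

>>> check_url_type('https://example.com/docs/index.html').name
'Absolute'
>>> check_url_type('//example.com/docs/index.html').name
'MissScheme'
>>> check_url_type('/docs/index.html').name
'RelativeSite'
>>> check_url_type('index.html').name
'RelativeFolder'
>>> check_url_type('#sec1').name
'RelativeArticle'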

def download(url, filename, download_timeout=30, override=False, **kwargs)
def download(url, filename, download_timeout=30, override=False, **kwargs):
    """
    Download the target URL and save it to the given filename.

    :param url: the url to download
    :param filename: the target filename
    :param download_timeout: give up if the download takes longer than this many seconds
    :param override: overwrite the file if it already exists
    """
    headers = {
        'user-agent': ua.random()
    }

    logger.info(f'start downloading file {url} to {filename}')
    start = time.time()

    filename = to_absolute_path(filename)

    # make sure folder exists
    mkdirs(os.path.dirname(filename))

    if os.path.exists(filename):
        if override:
            logger.info(f'{filename} exists, but it will be overridden.')
        else:
            logger.info(f'{filename} exists.')
            return

    response = requests.get(url, stream=True, headers=headers, **kwargs)

    # stream the response to disk chunk by chunk; abort if it takes too long
    timed_out = False
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
            if time.time() - start > download_timeout:
                timed_out = True
                break

    response.close()

    if timed_out:
        # remove the partial file only after it has been closed
        os.unlink(filename)
        logger.warning(f'{filename} download failed')
        return False

    return filename

Download the target URL and save it to the given filename.

:param url: the url to download
:param filename: the target filename
:param download_timeout: give up if the download takes longer than this many seconds
:param override: overwrite the file if it already exists
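
A usage sketch (the URL and local path below are placeholders; keyword arguments beyond those shown are passed straight through to requests.get):

from pywander.crawler.utils import download

# hypothetical file; give up after 60 seconds and overwrite any existing copy
download('https://example.com/files/report.pdf', 'downloads/report.pdf',
         download_timeout=60, override=True)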

def get_download_filename(url)
def get_download_filename(url):
    """
    Get the filename part from a download URL. The result is not necessarily meaningful.
    """
    path = get_url_path(url)
    filename = os.path.basename(path)
    return filename

Get the filename part from a download URL. The result is not necessarily meaningful.
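
For example (a placeholder URL; note that the query string is not part of the result):

>>> get_download_filename('https://example.com/files/report.pdf?download=1')
'report.pdf'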

def get_url_fragment(url)
def get_url_fragment(url):
    """
    Note that the returned fragment does not include the '#' symbol.
    """
    p = urlsplit(url)
    return p.fragment

Note that the returned fragment does not include the '#' symbol.
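
For example (a placeholder URL):

>>> get_url_fragment('https://example.com/docs/index.html#sec1')
'sec1'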

def get_url_netloc(url)
def get_url_netloc(url):
    """
    Get the netloc component of the url.
    """
    p = urlsplit(url)
    return p.netloc

Get the netloc component of the url.
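
For example (a placeholder URL):

>>> get_url_netloc('https://example.com/docs/index.html')
'example.com'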

def get_url_path(url)
def get_url_path(url):
    """
    Get the path component of the url.
    """
    p = urlsplit(url)
    return p.path

Get the path component of the url.
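
For example (a placeholder URL; the query string is not part of the path):

>>> get_url_path('https://example.com/docs/index.html?lang=en')
'/docs/index.html'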

def is_url_belong(url, baseurl)
def is_url_belong(url, baseurl):
    """
    Whether the url belongs to the baseurl.
    The check is a strict string-prefix match.
    """
    return url.startswith(baseurl)

Whether the url belongs to the baseurl. The check is a strict string-prefix match.
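
For example (placeholder URLs; the match is purely textual, so 'http://' and 'https://' variants of the same site do not match):

>>> is_url_belong('https://example.com/docs/python/linting', 'https://example.com/docs')
True
>>> is_url_belong('https://example.com/blog/post-1', 'https://example.com/docs')
False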

def is_url_in_article(url)
def is_url_in_article(url):
    """
    Whether the url points into an article, i.e. whether it carries a
    fragment such as `#sec1`.
    """
    p = urlsplit(url)
    return bool(p.fragment)

Whether the url points into an article, i.e. whether it carries a fragment such as `#sec1`.
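
For example (placeholder URLs):

>>> is_url_in_article('#sec1')
True
>>> is_url_in_article('https://example.com/docs/index.html')
False
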
def is_url_in_site(url, ref_url)
def is_url_in_site(url, ref_url):
    """
    is the url in site.
    the judgement is based on the refUrl's netloc.

>>> is_url_in_site('https://code.visualstudio.com/docs', \
    'https://code.visualstudio.com/docs/python/linting')
True
    """
    p = urlsplit(url)
    return p.netloc == urlsplit(ref_url).netloc

Whether the url is in the same site as ref_url. The judgement is based on ref_url's netloc.

>>> is_url_in_site('https://code.visualstudio.com/docs',
...     'https://code.visualstudio.com/docs/python/linting')
True

def remove_url_fragment(url)
def remove_url_fragment(url):
    """
    Remove the url fragment, e.g. `#sec1`; query parameters on the url
    are kept.
    """
    defragmented, frag = urldefrag(url)
    return defragmented

Remove the url fragment, e.g. #sec1; query parameters on the url are kept.
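
For example (a placeholder URL):

>>> remove_url_fragment('https://example.com/docs/index.html?lang=en#sec1')
'https://example.com/docs/index.html?lang=en'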

def to_absolute_url(url, ref_url)
def to_absolute_url(url, ref_url):
    """
    Given a ref_url, urljoin turns this url into an absolute URL.

    ref_url: except for an absolute URL, every other URL type needs the URL
             of the article it appears in (the ref_url) in order to be
             resolved into an absolute URL.

    For a crawler, converting every URL it encounters into an absolute URL
    right away may be a good choice, but other document-processing cases
    cannot be handled that simply; you need to carefully apply a different
    strategy for each URL type.
    """
    return urljoin(ref_url, url)

Given a ref_url, urljoin turns this url into an absolute URL.

ref_url: except for an absolute URL, every other URL type needs the URL of the article it appears in (the ref_url) in order to be resolved into an absolute URL.

For a crawler, converting every URL it encounters into an absolute URL right away may be a good choice, but other document-processing cases cannot be handled that simply; you need to carefully apply a different strategy for each URL type.
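
A short illustration (the ref_url is borrowed from the URLType comments below; the relative URLs are placeholders):

>>> ref = 'https://www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm'
>>> to_absolute_url('chap-11.htm', ref)
'https://www.cis.rit.edu/htbooks/nmr/chap-10/chap-11.htm'
>>> to_absolute_url('/htbooks/index.htm', ref)
'https://www.cis.rit.edu/htbooks/index.htm'
>>> to_absolute_url('#sec1', ref)
'https://www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm#sec1'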

Classes

class URLType (*args, **kwds)
class URLType(Enum):
    """
    ref_url: except for an Absolute URL, every other URL type needs the
    ref_url of the article the URL appears in to be resolved into an
    absolute URL.
    """
    Absolute = 1
    # 'https://www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm'
    MissScheme = 2
    # '//www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm' (needs ref_url)
    RelativeSite = 3
    # '/htbooks/nmr/chap-10/chap-10.htm' (needs ref_url)
    RelativeFolder = 4
    # 'chap-10.html' (needs ref_url)
    RelativeArticle = 5
    # '#sec1'
    InValid = 6

ref_url: except for an Absolute URL, every other URL type needs the ref_url of the article the URL appears in to be resolved into an absolute URL.

Ancestors

  • enum.Enum

Class variables

var Absolute

An absolute URL, e.g. 'https://www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm'.

var InValid

A URL with no scheme, netloc, path or fragment; it cannot be resolved.

var MissScheme

A scheme-relative URL, e.g. '//www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm'; needs a ref_url to be resolved.

var RelativeArticle

A fragment-only URL, e.g. '#sec1', pointing to a location inside the same article.

var RelativeFolder

A URL relative to the current folder, e.g. 'chap-10.html'; needs a ref_url to be resolved.

var RelativeSite

A site-root-relative URL, e.g. '/htbooks/nmr/chap-10/chap-10.htm'; needs a ref_url to be resolved.