data.extraction.web
Data - Analysis - Web¤
get_page_content(link, session=None, session_query_configs=None, method='GET', data=None)
¤
Download page and save content
Parameters:
Name | Type | Description | Default |
---|---|---|---|
headers |
dict, optional |
header information such as useragent, defaults to random user agent from get_random_user_agent |
required |
Source code in dietbox/data/extraction/web.py
def get_page_content(
link, session=None, session_query_configs=None, method="GET", data=None
):
"""Download page and save content
:param headers: header information such as useragent, defaults to random user agent from get_random_user_agent
:type headers: dict, optional
"""
if not session_query_configs:
session_query_configs = get_session_query_configs()
if not session:
session = get_session(
retry_params=None,
session=None,
)
if method == "GET":
content = session.get(link, **session_query_configs)
elif method == "POST":
if data is None:
data = {}
content = session.post(link, data=data, **session_query_configs)
status = content.status_code
return {"status": status, "content": content}
get_random_user_agent(browsers=None)
¤
get_random_user_agent returns a random user agent.
We provide two predefined browers, chrome and firefox.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
browsers |
list, optional |
which brower to be used, defaults to ["chrome", "firefox"] |
None |
Returns:
Type | Description |
---|---|
dict |
dictionary for requests module to consude as {'User-Agent': "blabla"} |
Source code in dietbox/data/extraction/web.py
def get_random_user_agent(browsers=None):
"""
get_random_user_agent returns a random user agent.
We provide two predefined browers, chrome and firefox.
:param browsers: which brower to be used, defaults to ["chrome", "firefox"]
:type browsers: list, optional
:return: dictionary for requests module to consude as {'User-Agent': "blabla"}
:rtype: dict
"""
if browsers is None:
browsers = ["chrome", "firefox"]
if isinstance(browsers, str):
browsers = [browsers]
chrome_user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
]
firefox_user_agents = [
"Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)",
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
"Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)",
"Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)",
]
user_agents_dict = {"chrome": chrome_user_agents, "firefox": firefox_user_agents}
# error if specified browser is not in the list
if set(browsers) - set(user_agents_dict.keys()):
logger.error(f"Unknown browser: {set(browsers) - set(user_agents_dict.keys())}")
user_agent_list = sum([user_agents_dict[browser] for browser in browsers], [])
return {"User-Agent": random.choice(user_agent_list)}
get_session(retry_params=None, session=None)
¤
get_session prepares a session object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
retry_params |
dict, optional |
the rules to retry, defaults to {"retries": 5, "backoff_factor": 0.3, "status_forcelist": (500, 502, 504)} |
None |
session |
[type], optional |
[description], defaults to None |
None |
Source code in dietbox/data/extraction/web.py
def get_session(
retry_params=None,
session=None,
):
"""
get_session prepares a session object.
:param retry_params: the rules to retry, defaults to {"retries": 5, "backoff_factor": 0.3, "status_forcelist": (500, 502, 504)}
:type retry_params: dict, optional
:param session: [description], defaults to None
:type session: [type], optional
"""
if retry_params is None:
retry_params = {
"retries": 5,
"backoff_factor": 0.3,
"status_forcelist": (500, 502, 504),
}
if session is None:
session = requests.Session()
retry = Retry(
total=retry_params.get("retries"),
read=retry_params.get("retries"),
connect=retry_params.get("retries"),
backoff_factor=retry_params.get("backoff_factor"),
status_forcelist=retry_params.get("status_forcelist"),
)
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)
return session
get_session_query_configs(headers=None, timeout=None, proxies=None, cookies=None)
¤
get_session_query_configs creates a session config dictionary for session to use. These are the keyword arguments of the session get or post methods.
Proxies can be set by providing a dictionary of the form
{
'http': some super_proxy_url,
'https': some super_proxy_url,
}
Parameters:
Name | Type | Description | Default |
---|---|---|---|
headers |
dict, optional |
header of the method such as use agent, defaults to random user agent from get_random_user_agent |
None |
timeout |
tuple, optional |
timeout strategy, defaults to (5, 14) |
None |
proxies |
dict, optional |
proxy configs, defaults to {} |
None |
cookies |
dict, optional |
cookie configs, defaults to {"language": "en"} |
None |
Returns:
Type | Description |
---|---|
dict |
dictionary of session configs for session methods, e.g., get, to use. |
Source code in dietbox/data/extraction/web.py
def get_session_query_configs(
headers=None,
timeout=None,
proxies=None,
cookies=None,
):
"""
get_session_query_configs creates a session config dictionary for session to use. These are the keyword arguments of the session get or post methods.
Proxies can be set by providing a dictionary of the form
```python
{
'http': some super_proxy_url,
'https': some super_proxy_url,
}
```
:param headers: header of the method such as use agent, defaults to random user agent from get_random_user_agent
:type headers: dict, optional
:param timeout: timeout strategy, defaults to (5, 14)
:type timeout: tuple, optional
:param proxies: proxy configs, defaults to {}
:type proxies: dict, optional
:param cookies: cookie configs, defaults to {"language": "en"}
:type cookies: dict, optional
:return: dictionary of session configs for session methods, e.g., get, to use.
:rtype: dict
"""
if cookies is None:
cookies = {"language": "en"}
if headers is None:
headers = get_random_user_agent()
if timeout is None:
timeout = (5, 14)
if proxies is None:
proxies = {}
return dict(headers=headers, proxies=proxies, cookies=cookies)