Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 53 additions & 4 deletions mediawikiapi/mediawikiapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ def __init__(self, config: Optional[Config] = None) -> None:

@memorized
def search(
self, query: str, results: int = 10, suggestion: bool = False
self,
query: str,
results: int = 10,
suggestion: bool = False,
follow_continue: bool = False,
) -> Union[List[str], Tuple[List[Any], Optional[List[str]]]]:
"""
Do a Wikipedia search for `query`.
Expand All @@ -26,6 +30,7 @@ def search(

* results - the maximum number of results returned
* suggestion - if True, return results and suggestion (if any) in a tuple
* follow_continue - if True, automatically follow continuation tokens to get all results
"""
search_params = {
"list": "search",
Expand All @@ -37,7 +42,9 @@ def search(
if suggestion:
search_params["srinfo"] = "suggestion"

raw_results = self.session.request(search_params, self.config)
raw_results = self.session.request(
search_params, self.config, follow_continue=follow_continue
)

if "error" in raw_results:
if raw_results["error"]["info"] in (
Expand Down Expand Up @@ -69,6 +76,7 @@ def geosearch(
title: Optional[str] = None,
results: int = 10,
radius: int = 1000,
follow_continue: bool = False,
) -> List[str]:
"""
Do a wikipedia geo search for `latitude` and `longitude`
Expand All @@ -84,6 +92,7 @@ def geosearch(
* title - The title of an article to search for
* results - the maximum number of results returned
* radius - Search radius in meters. The value must be between 10 and 10000
* follow_continue - if True, automatically follow continuation tokens to get all results
"""
search_params = {
"list": "geosearch",
Expand All @@ -94,7 +103,9 @@ def geosearch(
if title:
search_params["titles"] = title

raw_results = self.session.request(search_params, self.config)
raw_results = self.session.request(
search_params, self.config, follow_continue=follow_continue
)

if "error" in raw_results:
if raw_results["error"]["info"] in (
Expand Down Expand Up @@ -274,6 +285,7 @@ def category_members(
pageid: Optional[int] = None,
cmlimit: int = 10,
cmtype: str = "page",
follow_continue: bool = False,
) -> List[str]:
"""
Get list of page titles belonging to a category.
Expand All @@ -283,6 +295,7 @@ def category_members(
* pageid - page id of category page. Cannot be used together with "title"
* cmlimit - the maximum number of titles to return
* cmtype - which type of page to include. ("page", "subcat", or "file")
* follow_continue - if True, automatically follow continuation tokens to get all results
"""
if title is not None and pageid is not None:
raise ValueError(
Expand All @@ -305,7 +318,9 @@ def category_members(
else:
raise ValueError("Either a category or a pageid must be specified")

response = self.session.request(query_params, self.config)
response = self.session.request(
query_params, self.config, follow_continue=follow_continue
)
if "error" in response:
raise ValueError(response["error"].get("info"))
return [member["title"] for member in response["query"]["categorymembers"]]
Expand All @@ -317,3 +332,37 @@ def donate(self) -> None:
import webbrowser

webbrowser.open(Config().donate_url(), new=2)

def custom_query(
    self, query_params: Dict[str, Any], follow_continue: bool = True
) -> Dict[str, Any]:
    """
    Make a custom query to the Wikipedia API with the given parameters.

    This method is useful for complex queries that aren't covered by the standard methods,
    especially those that may return large amounts of data requiring continuation tokens.

    The caller's `query_params` dictionary is never modified: a shallow copy is
    handed to the session, because the session's `request` writes extra keys
    (such as "format") into the parameters it receives.

    Arguments:
    * query_params - A dictionary of query parameters to pass to the API
    * follow_continue - If True (the default), automatically follow continuation
      tokens to get all results

    Returns:
    * The raw API response as a dictionary

    Example:
    ```python
    # Query that uses geosearch with pageviews property
    params = {
        "action": "query",
        "generator": "geosearch",
        "ggsradius": 10000,
        "ggscoord": "40.7128|-74.0060",  # New York coordinates
        "ggslimit": 50,
        "prop": "pageviews",
    }
    result = mediawikiapi.custom_query(params)
    ```
    """
    # Copy first so the request layer's in-place additions stay out of the
    # caller's dictionary and the same params can be safely reused.
    request_params = dict(query_params)
    return self.session.request(
        request_params, self.config, follow_continue=follow_continue
    )
90 changes: 89 additions & 1 deletion mediawikiapi/requestsession.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def request(
params: Dict[str, Any],
config: Config,
language: Optional[Union[str, Language]] = None,
follow_continue: bool = False,
) -> Dict[str, Any]:
"""
Make a request to the Wikipedia API using the given search parameters,
Expand All @@ -43,6 +44,7 @@ def request(
Keyword arguments:

* language - the wiki language
* follow_continue - if True, automatically follow 'continue' tokens to get all results

"""
params["format"] = "json"
Expand Down Expand Up @@ -72,4 +74,90 @@ def request(
)

data: Dict[str, Any] = r.json()
return data

# If follow_continue is False or there's no continue token, return the data as is
if not follow_continue or "continue" not in data:
return data

# If follow_continue is True, handle continuation
result = data # Start with the initial result

# Continue requesting while there's a continue token
while "continue" in result:
# Copy the original parameters and update with continue tokens
continue_params = params.copy()
continue_params.update(result["continue"])

# Respect rate limits
if (
self.__rate_limit_last_call
and config.rate_limit
and (self.__rate_limit_last_call + config.rate_limit) > datetime.now()
):
wait_time = (
self.__rate_limit_last_call + config.rate_limit
) - datetime.now()
if wait_time.total_seconds() > 0:
time.sleep(int(wait_time.total_seconds()))

# Make the continuation request
r = self.session.get(
config.get_api_url(language),
params=continue_params,
headers=headers,
timeout=config.timeout,
)
self.__rate_limit_last_call = datetime.now()

# Get the continued data
continued_data = r.json()

# Merge the data from the continued request with the initial result
if "query" in continued_data:
# Handle pages
if "pages" in continued_data.get("query", {}) and "pages" in result.get(
"query", {}
):
for pageid, page_data in continued_data["query"]["pages"].items():
if pageid in result["query"]["pages"]:
# Page exists in the result, merge properties
for prop, value in page_data.items():
if prop in result["query"]["pages"][pageid]:
# If the property is a list, extend it
if isinstance(value, list) and isinstance(
result["query"]["pages"][pageid][prop], list
):
result["query"]["pages"][pageid][prop].extend(
value
)
else:
# Otherwise, replace it
result["query"]["pages"][pageid][prop] = value
else:
# Property doesn't exist in the result, add it
result["query"]["pages"][pageid][prop] = value
else:
# Page doesn't exist in the result, add it
result["query"]["pages"][pageid] = page_data

# Handle lists in the query (like search results, backlinks, etc.)
for prop, value in continued_data["query"].items():
if prop != "pages":
if prop not in result["query"]:
result["query"][prop] = value
elif isinstance(value, list) and isinstance(
result["query"][prop], list
):
# If the property is a list, extend it
result["query"][prop].extend(value)

# Update the continue token
if "continue" in continued_data:
result["continue"] = continued_data["continue"]
else:
# No more continue tokens, we're done
if "continue" in result:
del result["continue"]
break

return result
Loading
Loading