Refactored scraper code to support multiple requests, even multiple requests mixed between scrapers.

This commit is contained in:
Aloshi 2014-06-03 18:30:03 -05:00
parent 40ca44e54f
commit 8c0a40cebb
12 changed files with 187 additions and 186 deletions

View file

@@ -1,5 +1,7 @@
cmake_minimum_required(VERSION 2.6) cmake_minimum_required(VERSION 2.6)
INCLUDE(CPack)
project(emulationstation) project(emulationstation)
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------

View file

@@ -3,7 +3,6 @@
#include "pugiXML/pugixml.hpp" #include "pugiXML/pugixml.hpp"
#include "platform.h" #include "platform.h"
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include "scrapers/GamesDBScraper.h"
Settings* Settings::sInstance = NULL; Settings* Settings::sInstance = NULL;
@@ -50,8 +49,7 @@ void Settings::setDefaults()
mStringMap["TransitionStyle"] = "fade"; mStringMap["TransitionStyle"] = "fade";
mStringMap["ThemeSet"] = ""; mStringMap["ThemeSet"] = "";
mStringMap["ScreenSaverBehavior"] = "dim"; mStringMap["ScreenSaverBehavior"] = "dim";
mStringMap["Scraper"] = "TheGamesDB";
mScraper = std::shared_ptr<Scraper>(new GamesDBScraper());
} }
template <typename K, typename V> template <typename K, typename V>
@@ -83,9 +81,6 @@ void Settings::saveFile()
node.append_attribute("value").set_value(iter->second.c_str()); node.append_attribute("value").set_value(iter->second.c_str());
} }
pugi::xml_node scraperNode = doc.append_child("scraper");
scraperNode.append_attribute("value").set_value(mScraper->getName());
doc.save_file(path.c_str()); doc.save_file(path.c_str());
} }
@@ -112,23 +107,6 @@ void Settings::loadFile()
setFloat(node.attribute("name").as_string(), node.attribute("value").as_float()); setFloat(node.attribute("name").as_string(), node.attribute("value").as_float());
for(pugi::xml_node node = doc.child("string"); node; node = node.next_sibling("string")) for(pugi::xml_node node = doc.child("string"); node; node = node.next_sibling("string"))
setString(node.attribute("name").as_string(), node.attribute("value").as_string()); setString(node.attribute("name").as_string(), node.attribute("value").as_string());
if(doc.child("scraper"))
{
std::shared_ptr<Scraper> scr = createScraperByName(doc.child("scraper").attribute("value").as_string());
if(scr)
mScraper = scr;
}
}
std::shared_ptr<Scraper> Settings::getScraper()
{
return mScraper;
}
void Settings::setScraper(std::shared_ptr<Scraper> scraper)
{
mScraper = scraper;
} }
//Print a warning message if the setting we're trying to get doesn't already exist in the map, then return the value in the map. //Print a warning message if the setting we're trying to get doesn't already exist in the map, then return the value in the map.

View file

@@ -1,9 +1,6 @@
#ifndef _SETTINGS_H_ #pragma once
#define _SETTINGS_H_
#include <string> #include <string>
#include <map> #include <map>
#include "scrapers/Scraper.h"
//This is a singleton for storing settings. //This is a singleton for storing settings.
class Settings class Settings
@@ -25,9 +22,6 @@ public:
void setFloat(const std::string& name, float value); void setFloat(const std::string& name, float value);
void setString(const std::string& name, const std::string& value); void setString(const std::string& name, const std::string& value);
std::shared_ptr<Scraper> getScraper();
void setScraper(std::shared_ptr<Scraper> scraper);
private: private:
static Settings* sInstance; static Settings* sInstance;
@@ -41,8 +35,5 @@ private:
std::map<std::string, float> mFloatMap; std::map<std::string, float> mFloatMap;
std::map<std::string, std::string> mStringMap; std::map<std::string, std::string> mStringMap;
std::shared_ptr<Scraper> mScraper;
std::string mHomePathOverride; std::string mHomePathOverride;
}; };
#endif

View file

@@ -212,7 +212,7 @@ void ScraperSearchComponent::search(const ScraperSearchParams& params)
updateInfoPane(); updateInfoPane();
mLastSearch = params; mLastSearch = params;
mSearchHandle = Settings::getInstance()->getScraper()->getResultsAsync(params); mSearchHandle = startScraperSearch(params);
} }
void ScraperSearchComponent::stop() void ScraperSearchComponent::stop()

View file

@@ -38,16 +38,13 @@ GuiMenu::GuiMenu(Window* window) : GuiComponent(window), mMenu(window, "MAIN MEN
auto s = new GuiSettings(mWindow, "SCRAPER"); auto s = new GuiSettings(mWindow, "SCRAPER");
// scrape from // scrape from
auto scraper_list = std::make_shared< OptionListComponent< std::shared_ptr<Scraper> > >(mWindow, "SCRAPE FROM", false); auto scraper_list = std::make_shared< OptionListComponent< std::string > >(mWindow, "SCRAPE FROM", false);
std::vector< std::shared_ptr<Scraper> > scrapers; std::vector<std::string> scrapers = getScraperList();
scrapers.push_back(std::make_shared<GamesDBScraper>());
scrapers.push_back(std::make_shared<TheArchiveScraper>());
for(auto it = scrapers.begin(); it != scrapers.end(); it++) for(auto it = scrapers.begin(); it != scrapers.end(); it++)
scraper_list->add((*it)->getName(), *it, (*it)->getName() == Settings::getInstance()->getScraper()->getName()); scraper_list->add(*it, *it, *it == Settings::getInstance()->getString("Scraper"));
s->addWithLabel("SCRAPE FROM", scraper_list); s->addWithLabel("SCRAPE FROM", scraper_list);
s->addSaveFunc([scraper_list] { Settings::getInstance()->setScraper(scraper_list->getSelected()); }); s->addSaveFunc([scraper_list] { Settings::getInstance()->setString("Scraper", scraper_list->getSelected()); });
// scrape ratings // scrape ratings
auto scrape_ratings = std::make_shared<SwitchComponent>(mWindow); auto scrape_ratings = std::make_shared<SwitchComponent>(mWindow);

View file

@@ -1,4 +1,5 @@
#include "GuiSettings.h" #include "GuiSettings.h"
#include "../Window.h"
#include "../Settings.h" #include "../Settings.h"
#include "../views/ViewController.h" #include "../views/ViewController.h"

View file

@@ -1,14 +1,12 @@
#include "GamesDBScraper.h" #include "GamesDBScraper.h"
#include "../components/ScraperSearchComponent.h" #include "../components/ScraperSearchComponent.h"
#include "../components/AsyncReqComponent.h" #include "Scraper.h"
#include "../Log.h" #include "../Log.h"
#include "../pugiXML/pugixml.hpp" #include "../pugiXML/pugixml.hpp"
#include "../MetaData.h" #include "../MetaData.h"
#include "../Settings.h" #include "../Settings.h"
#include <boost/assign.hpp> #include <boost/assign.hpp>
const char* GamesDBScraper::getName() { return "TheGamesDB"; }
using namespace PlatformIds; using namespace PlatformIds;
const std::map<PlatformId, const char*> gamesdb_platformid_map = boost::assign::map_list_of const std::map<PlatformId, const char*> gamesdb_platformid_map = boost::assign::map_list_of
(THREEDO, "3DO") (THREEDO, "3DO")
@@ -61,14 +59,15 @@ const std::map<PlatformId, const char*> gamesdb_platformid_map = boost::assign::
(ZX_SPECTRUM, "Sinclair ZX Spectrum"); (ZX_SPECTRUM, "Sinclair ZX Spectrum");
std::unique_ptr<ScraperSearchHandle> GamesDBScraper::getResultsAsync(const ScraperSearchParams& params) void thegamesdb_generate_scraper_requests(const ScraperSearchParams& params, std::queue< std::unique_ptr<ScraperRequest> >& requests,
std::vector<ScraperSearchResult>& results)
{ {
std::string path = "/api/GetGame.php?"; std::string path = "thegamesdb.net/api/GetGame.php?";
std::string cleanName = params.nameOverride; std::string cleanName = params.nameOverride;
if(cleanName.empty()) if(cleanName.empty())
cleanName = params.game->getCleanName(); cleanName = params.game->getCleanName();
path += "name=" + HttpReq::urlEncode(cleanName); path += "name=" + HttpReq::urlEncode(cleanName);
if(params.system->getPlatformId() != PLATFORM_UNKNOWN) if(params.system->getPlatformId() != PLATFORM_UNKNOWN)
@@ -78,58 +77,33 @@ std::unique_ptr<ScraperSearchHandle> GamesDBScraper::getResultsAsync(const Scrap
{ {
path += "&platform="; path += "&platform=";
path += HttpReq::urlEncode(platformIt->second); path += HttpReq::urlEncode(platformIt->second);
}else{ }
else{
LOG(LogWarning) << "TheGamesDB scraper warning - no support for platform " << getPlatformName(params.system->getPlatformId()); LOG(LogWarning) << "TheGamesDB scraper warning - no support for platform " << getPlatformName(params.system->getPlatformId());
} }
} }
path = "thegamesdb.net" + path; requests.push(std::unique_ptr<ScraperRequest>(new ScraperHttpRequest(results, path, &thegamesdb_process_httpreq)));
return std::unique_ptr<ScraperSearchHandle>(new GamesDBHandle(params, path));
} }
GamesDBHandle::GamesDBHandle(const ScraperSearchParams& params, const std::string& url) : void thegamesdb_process_httpreq(const std::unique_ptr<HttpReq>& req, std::vector<ScraperSearchResult>& results)
mReq(std::unique_ptr<HttpReq>(new HttpReq(url)))
{ {
setStatus(ASYNC_IN_PROGRESS); assert(req->status() == HttpReq::REQ_SUCCESS);
}
void GamesDBHandle::update()
{
if(mStatus == ASYNC_DONE)
return;
if(mReq->status() == HttpReq::REQ_IN_PROGRESS)
return;
if(mReq->status() != HttpReq::REQ_SUCCESS)
{
std::stringstream ss;
ss << "Network error - " << mReq->getErrorMsg();
setError(ss.str());
return;
}
// our HTTP request was successful
// try to build our result list
std::vector<ScraperSearchResult> results;
pugi::xml_document doc; pugi::xml_document doc;
pugi::xml_parse_result parseResult = doc.load(mReq->getContent().c_str()); pugi::xml_parse_result parseResult = doc.load(req->getContent().c_str());
if(!parseResult) if(!parseResult)
{ {
setError("Error parsing XML"); LOG(LogError) << "GamesDBRequest - Error parsing XML. \n\t" << parseResult.description() << "";
return; return;
} }
pugi::xml_node data = doc.child("Data"); pugi::xml_node data = doc.child("Data");
std::string baseImageUrl = data.child("baseImgUrl").text().get(); std::string baseImageUrl = data.child("baseImgUrl").text().get();
unsigned int resultNum = 0;
pugi::xml_node game = data.child("Game"); pugi::xml_node game = data.child("Game");
while(game && resultNum < MAX_SCRAPER_RESULTS) while(game && results.size() < MAX_SCRAPER_RESULTS)
{ {
ScraperSearchResult result; ScraperSearchResult result;
@@ -166,12 +140,6 @@ void GamesDBHandle::update()
} }
results.push_back(result); results.push_back(result);
resultNum++;
game = game.next_sibling("Game"); game = game.next_sibling("Game");
} }
setStatus(ASYNC_DONE);
setResults(results);
return;
} }

View file

@@ -1,24 +1,8 @@
#pragma once #pragma once
#include "Scraper.h" #include "Scraper.h"
#include "../HttpReq.h"
class GamesDBHandle : public ScraperSearchHandle void thegamesdb_generate_scraper_requests(const ScraperSearchParams& params, std::queue< std::unique_ptr<ScraperRequest> >& requests,
{ std::vector<ScraperSearchResult>& results);
public:
GamesDBHandle(const ScraperSearchParams& params, const std::string& url);
void update() override; void thegamesdb_process_httpreq(const std::unique_ptr<HttpReq>& req, std::vector<ScraperSearchResult>& results);
private:
std::unique_ptr<HttpReq> mReq;
ScraperSearchParams mParams;
};
class GamesDBScraper : public Scraper
{
public:
std::unique_ptr<ScraperSearchHandle> getResultsAsync(const ScraperSearchParams& params) override;
const char* getName();
};

View file

@@ -4,21 +4,90 @@
#include "../Settings.h" #include "../Settings.h"
#include <FreeImage.h> #include <FreeImage.h>
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <boost/regex.hpp> #include <boost/assign.hpp>
#include "GamesDBScraper.h" #include "GamesDBScraper.h"
#include "TheArchiveScraper.h" #include "TheArchiveScraper.h"
std::shared_ptr<Scraper> createScraperByName(const std::string& name) const std::map<std::string, generate_scraper_requests_func> scraper_request_funcs = boost::assign::map_list_of
{ ("TheGamesDB", &thegamesdb_generate_scraper_requests)
if(name == "TheGamesDB") ("TheArchive", &thearchive_generate_scraper_requests);
return std::shared_ptr<Scraper>(new GamesDBScraper());
else if(name == "TheArchive")
return std::shared_ptr<Scraper>(new TheArchiveScraper());
return nullptr; std::unique_ptr<ScraperSearchHandle> startScraperSearch(const ScraperSearchParams& params)
{
const std::string& name = Settings::getInstance()->getString("Scraper");
std::unique_ptr<ScraperSearchHandle> handle(new ScraperSearchHandle());
scraper_request_funcs.at(name)(params, handle->mRequestQueue, handle->mResults);
return handle;
} }
std::vector<std::string> getScraperList()
{
std::vector<std::string> list;
for(auto it = scraper_request_funcs.begin(); it != scraper_request_funcs.end(); it++)
{
list.push_back(it->first);
}
return list;
}
// ScraperSearchHandle
ScraperSearchHandle::ScraperSearchHandle()
{
setStatus(ASYNC_IN_PROGRESS);
}
void ScraperSearchHandle::update()
{
if(mStatus == ASYNC_DONE)
return;
while(!mRequestQueue.empty() && mRequestQueue.front()->update())
mRequestQueue.pop();
if(mRequestQueue.empty())
{
setStatus(ASYNC_DONE);
return;
}
}
// ScraperRequest
ScraperRequest::ScraperRequest(std::vector<ScraperSearchResult>& resultsWrite) : mResults(resultsWrite)
{
}
// ScraperHttpRequest
ScraperHttpRequest::ScraperHttpRequest(std::vector<ScraperSearchResult>& resultsWrite, const std::string& url, scraper_process_httpreq processFunc)
: ScraperRequest(resultsWrite), mProcessFunc(processFunc)
{
mReq = std::unique_ptr<HttpReq>(new HttpReq(url));
}
bool ScraperHttpRequest::update()
{
if(mReq->status() == HttpReq::REQ_SUCCESS)
{
mProcessFunc(mReq, mResults);
return true;
}
if(mReq->status() == HttpReq::REQ_IN_PROGRESS)
return false;
// everything else is some sort of error
LOG(LogError) << "ScraperHttpRequest network error - " << mReq->getErrorMsg();
return true;
}
// metadata resolving stuff
std::unique_ptr<MDResolveHandle> resolveMetaDataAssets(const ScraperSearchResult& result, const ScraperSearchParams& search) std::unique_ptr<MDResolveHandle> resolveMetaDataAssets(const ScraperSearchResult& result, const ScraperSearchParams& search)
{ {
return std::unique_ptr<MDResolveHandle>(new MDResolveHandle(result, search)); return std::unique_ptr<MDResolveHandle>(new MDResolveHandle(result, search));

View file

@ -6,6 +6,7 @@
#include "../AsyncHandle.h" #include "../AsyncHandle.h"
#include <vector> #include <vector>
#include <functional> #include <functional>
#include <queue>
struct ScraperSearchParams struct ScraperSearchParams
{ {
@@ -24,29 +25,87 @@
std::string thumbnailUrl; std::string thumbnailUrl;
}; };
// So let me explain why I've abstracted this so heavily.
// There are two ways I can think of that you'd want to write a scraper.
// 1. Do some HTTP request(s) -> process it -> return the results
// 2. Do some local filesystem queries (an offline scraper) -> return the results
// The first way needs to be asynchronous while it's waiting for the HTTP request to return.
// The second doesn't.
// It would be nice if we could write it like this:
// search = generate_http_request(searchparams);
// wait_until_done(search);
// ... process search ...
// return results;
// We could do this if we used threads. Right now ES doesn't because I'm pretty sure I'll fuck it up,
// and I'm not sure of the performance of threads on the Pi (single-core ARM).
// We could also do this if we used coroutines.
// I can't find a really good cross-platform coroutine library (x86/64/ARM Linux + Windows),
// and I don't want to spend more time chasing libraries than just writing it the long way once.
// So, I did it the "long" way.
// ScraperSearchHandle - one logical search, e.g. "search for mario"
// ScraperRequest - encapsulates some sort of asynchronous request that will ultimately return some results
// ScraperHttpRequest - implementation of ScraperRequest that waits on an HttpReq, then processes it with some processing function.
// a scraper search gathers results from (potentially multiple) ScraperRequests
class ScraperRequest
{
public:
ScraperRequest(std::vector<ScraperSearchResult>& resultsWrite);
// returns "true" once we're done
virtual bool update() = 0;
protected:
std::vector<ScraperSearchResult>& mResults;
};
typedef void (*scraper_process_httpreq)(const std::unique_ptr<HttpReq>& req, std::vector<ScraperSearchResult>& results);
// a single HTTP request that needs to be processed to get the results
class ScraperHttpRequest : ScraperRequest
{
public:
ScraperHttpRequest(std::vector<ScraperSearchResult>& resultsWrite, const std::string& url, scraper_process_httpreq processFunc);
bool update() override;
private:
scraper_process_httpreq mProcessFunc;
std::unique_ptr<HttpReq> mReq;
};
// a request to get a list of results
class ScraperSearchHandle : public AsyncHandle class ScraperSearchHandle : public AsyncHandle
{ {
public: public:
virtual void update() = 0; ScraperSearchHandle();
void update();
inline const std::vector<ScraperSearchResult>& getResults() const { assert(mStatus != ASYNC_IN_PROGRESS); return mResults; } inline const std::vector<ScraperSearchResult>& getResults() const { assert(mStatus != ASYNC_IN_PROGRESS); return mResults; }
protected: protected:
inline void setResults(const std::vector<ScraperSearchResult>& results) { mResults = results; } friend std::unique_ptr<ScraperSearchHandle> startScraperSearch(const ScraperSearchParams& params);
private: std::queue< std::unique_ptr<ScraperRequest> > mRequestQueue;
std::vector<ScraperSearchResult> mResults; std::vector<ScraperSearchResult> mResults;
}; };
class Scraper // will use the current scraper settings to pick the result source
{ std::unique_ptr<ScraperSearchHandle> startScraperSearch(const ScraperSearchParams& params);
public:
//Get a list of potential results.
virtual std::unique_ptr<ScraperSearchHandle> getResultsAsync(const ScraperSearchParams& params) = 0;
virtual const char* getName() = 0; // returns a list of valid scraper names
}; std::vector<std::string> getScraperList();
typedef void (*generate_scraper_requests_func)(const ScraperSearchParams& params, std::queue< std::unique_ptr<ScraperRequest> >& requests, std::vector<ScraperSearchResult>& results);
// -------------------------------------------------------------------------
std::shared_ptr<Scraper> createScraperByName(const std::string& name);
// Meta data asset downloading stuff. // Meta data asset downloading stuff.

View file

@@ -4,64 +4,37 @@
#include "../Log.h" #include "../Log.h"
#include "../pugiXML/pugixml.hpp" #include "../pugiXML/pugixml.hpp"
const char* TheArchiveScraper::getName() { return "TheArchive"; } void thearchive_generate_scraper_requests(const ScraperSearchParams& params, std::queue< std::unique_ptr<ScraperRequest> >& requests,
std::vector<ScraperSearchResult>& results)
std::unique_ptr<ScraperSearchHandle> TheArchiveScraper::getResultsAsync(const ScraperSearchParams& params)
{ {
std::string path = "/2.0/Archive.search/xml/7TTRM4MNTIKR2NNAGASURHJOZJ3QXQC5/"; std::string path = "api.archive.vg/2.0/Archive.search/xml/7TTRM4MNTIKR2NNAGASURHJOZJ3QXQC5/";
std::string cleanName = params.nameOverride; std::string cleanName = params.nameOverride;
if(cleanName.empty()) if(cleanName.empty())
cleanName = params.game->getCleanName(); cleanName = params.game->getCleanName();
path += HttpReq::urlEncode(cleanName); path += HttpReq::urlEncode(cleanName);
//platform TODO, should use some params.system get method //platform TODO, should use some params.system get method
path = "api.archive.vg" + path; requests.push(std::unique_ptr<ScraperRequest>(new ScraperHttpRequest(results, path, &thearchive_process_httpreq)));
return std::unique_ptr<ScraperSearchHandle>(new TheArchiveHandle(params, path));
} }
TheArchiveHandle::TheArchiveHandle(const ScraperSearchParams& params, const std::string& url) : void thearchive_process_httpreq(const std::unique_ptr<HttpReq>& req, std::vector<ScraperSearchResult>& results)
mReq(std::unique_ptr<HttpReq>(new HttpReq(url)))
{ {
setStatus(ASYNC_IN_PROGRESS); assert(req->status() == HttpReq::REQ_SUCCESS);
}
void TheArchiveHandle::update()
{
if(mStatus == ASYNC_DONE)
return;
if(mReq->status() == HttpReq::REQ_IN_PROGRESS)
return;
if(mReq->status() != HttpReq::REQ_SUCCESS)
{
std::stringstream ss;
ss << "Network error: " << mReq->getErrorMsg();
setError(ss.str());
return;
}
// if we're here, our HTTP request finished successfully
// so, let's try building our result list
std::vector<ScraperSearchResult> results;
pugi::xml_document doc; pugi::xml_document doc;
pugi::xml_parse_result parseResult = doc.load(mReq->getContent().c_str()); pugi::xml_parse_result parseResult = doc.load(req->getContent().c_str());
if(!parseResult) if(!parseResult)
{ {
setError("Error parsing XML"); LOG(LogError) << "TheArchiveRequest - error parsing XML.\n\t" << parseResult.description();
return; return;
} }
pugi::xml_node data = doc.child("OpenSearchDescription").child("games"); pugi::xml_node data = doc.child("OpenSearchDescription").child("games");
unsigned int resultNum = 0;
pugi::xml_node game = data.child("game"); pugi::xml_node game = data.child("game");
while(game && resultNum < MAX_SCRAPER_RESULTS) while(game && results.size() < MAX_SCRAPER_RESULTS)
{ {
ScraperSearchResult result; ScraperSearchResult result;
@@ -86,11 +59,6 @@ void TheArchiveHandle::update()
result.thumbnailUrl = thumbnail.text().get(); result.thumbnailUrl = thumbnail.text().get();
results.push_back(result); results.push_back(result);
resultNum++;
game = game.next_sibling("game"); game = game.next_sibling("game");
} }
setStatus(ASYNC_DONE);
setResults(results);
} }

View file

@@ -1,24 +1,8 @@
#pragma once #pragma once
#include "Scraper.h" #include "Scraper.h"
#include "../HttpReq.h"
class TheArchiveHandle : public ScraperSearchHandle void thearchive_generate_scraper_requests(const ScraperSearchParams& params, std::queue< std::unique_ptr<ScraperRequest> >& requests,
{ std::vector<ScraperSearchResult>& results);
public:
TheArchiveHandle(const ScraperSearchParams& params, const std::string& url);
void update() override; void thearchive_process_httpreq(const std::unique_ptr<HttpReq>& req, std::vector<ScraperSearchResult>& results);
private:
std::unique_ptr<HttpReq> mReq;
ScraperSearchParams mParams;
};
class TheArchiveScraper : public Scraper
{
public:
std::unique_ptr<ScraperSearchHandle> getResultsAsync(const ScraperSearchParams& params) override;
const char* getName();
};