Add ScreenScraper.fr as scraping source

This commit is contained in:
Cristi Mitrana 2019-01-11 21:51:05 +02:00 committed by Cristi Mitrana
parent 6ffda17c4e
commit cdd43bf7e9
9 changed files with 459 additions and 9 deletions

View file

@ -40,6 +40,7 @@ set(ES_HEADERS
# Scrapers
${CMAKE_CURRENT_SOURCE_DIR}/src/scrapers/Scraper.h
${CMAKE_CURRENT_SOURCE_DIR}/src/scrapers/GamesDBScraper.h
${CMAKE_CURRENT_SOURCE_DIR}/src/scrapers/ScreenScraper.h
# Views
${CMAKE_CURRENT_SOURCE_DIR}/src/views/gamelist/BasicGameListView.h
@ -96,6 +97,7 @@ set(ES_SOURCES
# Scrapers
${CMAKE_CURRENT_SOURCE_DIR}/src/scrapers/Scraper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/scrapers/GamesDBScraper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/scrapers/ScreenScraper.cpp
# Views
${CMAKE_CURRENT_SOURCE_DIR}/src/views/gamelist/BasicGameListView.cpp

View file

@ -61,6 +61,8 @@ namespace PlatformIds
"psvita",
"psp", // playstation portable
"snes", // super nintendo entertainment system
"scummvm",
"x6800",
"pcengine", // (aka turbografx-16) HuCards only
"pcenginecd", // (aka turbografx-16) CD-ROMs only
"wonderswan",

View file

@ -62,6 +62,8 @@ namespace PlatformIds
PLAYSTATION_VITA,
PLAYSTATION_PORTABLE,
SUPER_NINTENDO,
SCUMMVM,
SHARP_X6800,
TURBOGRAFX_16, // (aka PC Engine) HuCards only
TURBOGRAFX_CD, // (aka PC Engine) CD-ROMs only
WONDERSWAN,

View file

@ -2,6 +2,7 @@
#include "FileData.h"
#include "GamesDBScraper.h"
#include "ScreenScraper.h"
#include "Log.h"
#include "Settings.h"
#include "SystemData.h"
@ -9,7 +10,8 @@
#include <fstream>
const std::map<std::string, generate_scraper_requests_func> scraper_request_funcs {
{ "TheGamesDB", &thegamesdb_generate_scraper_requests }
{ "TheGamesDB", &thegamesdb_generate_scraper_requests },
{ "ScreenScraper", &screenscraper_generate_scraper_requests }
};
std::unique_ptr<ScraperSearchHandle> startScraperSearch(const ScraperSearchParams& params)
@ -126,7 +128,23 @@ MDResolveHandle::MDResolveHandle(const ScraperSearchResult& result, const Scrape
{
if(!result.imageUrl.empty())
{
std::string imgPath = getSaveAsPath(search, "image", result.imageUrl);
std::string ext;
// If we have a file extension returned by the scraper, then use it.
// Otherwise, try to guess it from the URL, which points to an image.
if (!result.imageType.empty())
{
ext = result.imageType;
}else{
size_t dot = result.imageUrl.find_last_of('.');
if (dot != std::string::npos)
ext = result.imageUrl.substr(dot, std::string::npos);
}
std::string imgPath = getSaveAsPath(search, "image", ext);
mFuncs.push_back(ResolvePair(downloadImageAsync(result.imageUrl, imgPath), [this, imgPath]
{
mResult.mdl.set("image", imgPath);
@ -269,7 +287,7 @@ bool resizeImage(const std::string& path, int maxWidth, int maxHeight)
return saved;
}
std::string getSaveAsPath(const ScraperSearchParams& params, const std::string& suffix, const std::string& url)
std::string getSaveAsPath(const ScraperSearchParams& params, const std::string& suffix, const std::string& extension)
{
const std::string subdirectory = params.system->getName();
const std::string name = Utils::FileSystem::getStem(params.game->getPath()) + "-" + suffix;
@ -284,11 +302,7 @@ std::string getSaveAsPath(const ScraperSearchParams& params, const std::string&
if(!Utils::FileSystem::exists(path))
Utils::FileSystem::createDirectory(path);
size_t dot = url.find_last_of('.');
std::string ext;
if(dot != std::string::npos)
ext = url.substr(dot, std::string::npos);
path += name + ext;
path += name + extension;
return path;
}

View file

@ -31,6 +31,9 @@ struct ScraperSearchResult
MetaDataList mdl;
std::string imageUrl;
std::string thumbnailUrl;
// Needed to pre-set the image type
std::string imageType;
};
// So let me explain why I've abstracted this so heavily.

View file

@ -0,0 +1,339 @@
#include "scrapers/ScreenScraper.h"
#include "utils/TimeUtil.h"
#include "utils/StringUtil.h"
#include "FileData.h"
#include "Log.h"
#include "PlatformId.h"
#include "Settings.h"
#include "SystemData.h"
#include <pugixml/src/pugixml.hpp>
#include <cstring>
using namespace PlatformIds;
/**
Maps EmulationStation platform IDs to ScreenScraper system IDs.
The mapped value is what gets sent as the "systemeid" URL parameter of a scrape request.
List of systems and their IDs from
https://www.screenscraper.fr/api/systemesListe.php?devid=xxx&devpassword=yyy&softname=zzz&output=XML
Note that several platforms deliberately share one ScreenScraper ID (see the inline remarks).
**/
const std::map<PlatformId, unsigned short> screenscraper_platformid_map{
{ THREEDO, 29 },
{ AMIGA, 64 },
{ AMSTRAD_CPC, 65 },
{ APPLE_II, 86 },
{ ARCADE, 75 },
{ ATARI_800, 26 }, // No separate ID is used for the Atari 800: it reuses the ATARI_2600 ID (26)
{ ATARI_2600, 26 },
{ ATARI_5200, 40 },
{ ATARI_7800, 41 },
{ ATARI_JAGUAR, 27 },
{ ATARI_JAGUAR_CD, 171 },
{ ATARI_LYNX, 28 },
{ ATARI_ST, 42},
// missing Atari XE ?
{ COLECOVISION, 48 },
{ COMMODORE_64, 66 },
{ INTELLIVISION, 115 },
{ MAC_OS, 146 },
{ XBOX, 32 },
{ XBOX_360, 33 },
{ MSX, 113 },
{ NEOGEO, 142 },
{ NEOGEO_POCKET, 25},
{ NEOGEO_POCKET_COLOR, 82 },
{ NINTENDO_3DS, 17 },
{ NINTENDO_64, 14 },
{ NINTENDO_DS, 15 },
{ FAMICOM_DISK_SYSTEM, 106 },
{ NINTENDO_ENTERTAINMENT_SYSTEM, 3 },
{ GAME_BOY, 9 },
{ GAME_BOY_ADVANCE, 12 },
{ GAME_BOY_COLOR, 10 },
{ NINTENDO_GAMECUBE, 13 },
{ NINTENDO_WII, 16 },
{ NINTENDO_WII_U, 18 },
{ NINTENDO_VIRTUAL_BOY, 11 },
{ NINTENDO_GAME_AND_WATCH, 52 },
{ PC, 135 },
{ SCUMMVM, 123},
{ SEGA_32X, 19 },
{ SEGA_CD, 20 },
{ SEGA_DREAMCAST, 23 },
{ SEGA_GAME_GEAR, 21 },
{ SEGA_GENESIS, 1 }, // Same ScreenScraper ID as SEGA_MEGA_DRIVE below
{ SEGA_MASTER_SYSTEM, 2 },
{ SEGA_MEGA_DRIVE, 1 },
{ SEGA_SATURN, 22 },
{ SEGA_SG1000, 109 },
{ SHARP_X6800, 79},
{ PLAYSTATION, 57 },
{ PLAYSTATION_2, 58 },
{ PLAYSTATION_3, 59 },
// missing Sony Playstation 4 ?
{ PLAYSTATION_VITA, 62 },
{ PLAYSTATION_PORTABLE, 61 },
{ SUPER_NINTENDO, 4 },
{ TURBOGRAFX_16, 31 },
{ TURBOGRAFX_CD, 114 },
{ WONDERSWAN, 45 },
{ WONDERSWAN_COLOR, 46 },
{ ZX_SPECTRUM, 76 },
{ VIDEOPAC_ODYSSEY2, 104 },
{ VECTREX, 102 },
{ TRS80_COLOR_COMPUTER, 144 }, // Same ScreenScraper ID as TANDY below
{ TANDY, 144 }
};
// XML parsing helper: tries every candidate name in the given order and returns the first node
// matched by the XPath query "//<name>". Returns an empty (null) node when nothing matches.
// NOTE(review): a leading "//" is an absolute XPath, so the search presumably covers the whole
// document that 'node' belongs to rather than only the subtree below 'node' - confirm.
pugi::xml_node find_node_by_name_re(const pugi::xml_node& node, const std::vector<std::string> node_names) {
	for (const std::string& candidate : node_names)
	{
		const std::string expression = static_cast<std::string>("//") + candidate;
		pugi::xpath_query compiledQuery(expression.c_str());
		pugi::xpath_node_set matches = node.select_nodes(compiledQuery);

		// Nothing found under this name, move on to the next candidate.
		if (matches.size() == 0)
			continue;

		return matches.first().node();
	}

	return pugi::xml_node();
}
// XML parsing helper: looks for a direct child of 'node_parent' called 'node_name' whose attribute
// 'attribute_name' equals one of 'attribute_values'.
// The values are tried in list order, so earlier entries take priority over later ones; for each value
// the children are scanned in document order. Returns an empty (null) node when nothing matches.
pugi::xml_node find_child_by_attribute_list(const pugi::xml_node& node_parent, const std::string& node_name, const std::string& attribute_name, const std::vector<std::string> attribute_values)
{
	const char* childTag = node_name.c_str();
	const char* attributeKey = attribute_name.c_str();

	for (const std::string& wanted : attribute_values)
	{
		// Walk the same-named siblings one by one.
		for (pugi::xml_node child = node_parent.child(childTag); child; child = child.next_sibling(childTag))
		{
			const char* actual = child.attribute(attributeKey).value();
			if (strcmp(actual, wanted.c_str()) == 0)
				return child;
		}
	}

	return pugi::xml_node();
}
// Queues one ScreenScraper "jeuInfos" request per platform ID of the system being scraped.
//
// params   - search parameters; the game name comes from params.nameOverride or, when that is
//            empty, from the clean name of params.game.
// requests - queue that receives the generated requests.
// results  - result vector handed to every request, which it fills once processed.
//
// Every request URL starts from the same base search URL. Previously the "&systemeid=" parameter
// was appended to one shared string, so the request for the second (third, ...) platform also carried
// the IDs of all the platforms before it.
// A platform without a ScreenScraper mapping is logged and still queued, without a system ID.
void screenscraper_generate_scraper_requests(const ScraperSearchParams& params,
	std::queue< std::unique_ptr<ScraperRequest> >& requests,
	std::vector<ScraperSearchResult>& results)
{
	ScreenScraperRequest::ScreenScraperConfig ssConfig;

	std::string cleanName = params.nameOverride;
	if (cleanName.empty())
		cleanName = params.game->getCleanName();

	// Part of the URL shared by all platforms; never modified inside the loop.
	const std::string basePath = ssConfig.getGameSearchUrl(cleanName);

	auto& platforms = params.system->getPlatformIds();
	for (const auto& platform : platforms)
	{
		// Fresh copy per platform so that system IDs do not pile up across iterations.
		std::string path = basePath;

		const auto mapIt = screenscraper_platformid_map.find(platform);
		if (mapIt != screenscraper_platformid_map.cend())
		{
			path += "&systemeid=";
			path += HttpReq::urlEncode(std::to_string(mapIt->second));
		}else{
			LOG(LogWarning) << "ScreenScraper: no support for platform " << getPlatformName(platform);
		}

		requests.push(std::unique_ptr<ScraperRequest>(new ScreenScraperRequest(requests, results, path)));
	}
}
void ScreenScraperRequest::process(const std::unique_ptr<HttpReq>& req, std::vector<ScraperSearchResult>& results)
{
assert(req->status() == HttpReq::REQ_SUCCESS);
pugi::xml_document doc;
pugi::xml_parse_result parseResult = doc.load(req->getContent().c_str());
if (!parseResult)
{
std::stringstream ss;
ss << "ScreenScraperRequest - Error parsing XML." << std::endl << parseResult.description() << "";
std::string err = ss.str();
setError(err);
LOG(LogError) << err;
return;
}
processGame(doc, results);
}
// Converts the first <jeu> element below <Data> into a ScraperSearchResult and appends it to 'results'.
// Nothing is appended when the document contains no such element.
// Region- and language-dependent fields are picked with the priority lists spelled out below, always
// starting with the region/language configured in ScreenScraperConfig.
void ScreenScraperRequest::processGame(const pugi::xml_document& xmldoc, std::vector<ScraperSearchResult>& results)
{
	const pugi::xml_node game = xmldoc.child("Data").child("jeu");
	if (!game)
		return;

	ScraperSearchResult result;
	ScreenScraperRequest::ScreenScraperConfig ssConfig;

	const std::string region = Utils::String::toLower(ssConfig.region);
	const std::string language = Utils::String::toLower(ssConfig.language);

	// Name ( Xpath: Data/jeu[0]/noms/nom[*] ). Region priority: configured region, WOR(LD), US, SS, EU, JP.
	const std::string gameName = find_child_by_attribute_list(game.child("noms"), "nom", "region", { region, "wor", "us" , "ss", "eu", "jp" }).text().get();
	result.mdl.set("name", gameName);

	// Description. Language priority: configured language, EN, WOR(LD).
	const std::string description = find_child_by_attribute_list(game.child("synopsis"), "synopsis", "langue", { language, "en", "wor" }).text().get();
	if (!description.empty())
		result.mdl.set("desc", Utils::String::replace(description, "&nbsp;", " "));

	// Genre ( Xpath: Data/jeu[0]/genres/genre[*] ). Language priority: configured language, EN.
	const std::string genre = find_child_by_attribute_list(game.child("genres"), "genre", "langue", { language, "en" }).text().get();
	result.mdl.set("genre", genre);
	LOG(LogDebug) << "Genre: " << result.mdl.get("genre");

	// Release date: the 'dates' child of 'jeu' holds several 'date' nodes, one per region.
	// Region priority: configured region, WOR(LD), US, SS, JP, EU.
	const std::string releaseDate = find_child_by_attribute_list(game.child("dates"), "date", "region", { region, "wor", "us", "ss", "jp", "eu" }).text().get();
	LOG(LogDebug) << "Release Date (unparsed): " << releaseDate;

	// The value is either YYYY-MM-DD or just YYYY; anything longer than four characters is treated as a full date.
	if (!releaseDate.empty())
	{
		const char* dateFormat = (releaseDate.length() > 4) ? "%Y-%m-%d" : "%Y";
		result.mdl.set("releasedate", Utils::Time::DateTime(Utils::Time::stringToTime(releaseDate, dateFormat)));
	}
	LOG(LogDebug) << "Release Date (parsed): " << result.mdl.get("releasedate");

	// Developer ( Xpath: Data/jeu[0]/developpeur )
	const std::string developer = game.child("developpeur").text().get();
	if (!developer.empty())
		result.mdl.set("developer", Utils::String::replace(developer, "&nbsp;", " "));

	// Publisher ( Xpath: Data/jeu[0]/editeur )
	const std::string publisher = game.child("editeur").text().get();
	if (!publisher.empty())
		result.mdl.set("publisher", Utils::String::replace(publisher, "&nbsp;", " "));

	// Players
	const std::string players = game.child("joueurs").text().get();
	result.mdl.set("players", players);

	// TODO: Validate rating
	// The integer value of 'note' is divided by 20 before it is stored as the rating.
	if (Settings::getInstance()->getBool("ScrapeRatings") && game.child("note"))
	{
		const float ratingVal = (game.child("note").text().as_int() / 20.0f);
		std::stringstream ratingText;
		ratingText << ratingVal;
		result.mdl.set("rating", ratingText.str());
	}

	// Media super-node
	const pugi::xml_node media_list = game.child("medias");
	if (media_list)
	{
		pugi::xml_node art;

		// Every child of 'medias' looks like <media type="..." region="..." format="...">.
		// Select all media of the wanted type with XPath first, then choose among them by region.
		const std::string mediaQuery = static_cast<std::string>("media[@type='") + ssConfig.media_name + "']";
		const pugi::xpath_node_set candidates = media_list.select_nodes(mediaQuery.c_str());

		// Region priority: configured region, WOR(LD), US, CUS(TOM?), JP, EU
		const std::vector<std::string> regionPriority{ region, "wor", "us", "cus", "jp", "eu" };
		for (auto regionIt = regionPriority.cbegin(); !art && regionIt != regionPriority.cend(); ++regionIt)
		{
			for (const pugi::xpath_node& candidate : candidates)
			{
				if (candidate.node().attribute("region").value() == *regionIt)
				{
					art = candidate.node();
					break;
				}
			}
		}

		if (art)
		{
			// Sending a 'softname' containing space will make the image URLs returned by the API also contain the space.
			// Escape any spaces in the URL here
			result.imageUrl = Utils::String::replace(art.text().get(), " ", "%20");

			// File format reported by ScreenScraper, stored with a leading dot as the image extension
			const std::string media_type = art.attribute("format").value();
			if (!media_type.empty())
				result.imageType = "." + media_type;

			// Ask for the same image, but with a smaller size, for the thumbnail displayed during scraping
			result.thumbnailUrl = result.imageUrl + "&maxheight=250";
		}else{
			LOG(LogDebug) << "Failed to find media XML node with name=" << ssConfig.media_name;
		}
	}

	results.push_back(result);
}
// Currently not used in this module.
// Walks the <jeu> elements below <Data> and queues one follow-up game request per element,
// stopping after MAX_SCRAPER_RESULTS entries. Requires a request queue (asserted).
void ScreenScraperRequest::processList(const pugi::xml_document& xmldoc, std::vector<ScraperSearchResult>& results)
{
	assert(mRequestQueue != nullptr);

	LOG(LogDebug) << "Processing a list of results";

	pugi::xml_node game = xmldoc.child("Data").child("jeu");
	if (!game)
	{
		LOG(LogDebug) << "Found nothing";
	}

	ScreenScraperRequest::ScreenScraperConfig ssConfig;

	// limit the number of results per platform, not in total.
	// otherwise if the first platform returns >= 7 games
	// but the second platform contains the relevant game,
	// the relevant result would not be shown.
	int queued = 0;
	while (game && queued < MAX_SCRAPER_RESULTS)
	{
		const std::string id = game.child("id").text().get();
		const std::string name = game.child("nom").text().get();
		const std::string platformId = game.child("systemeid").text().get();

		std::string path = ssConfig.getGameSearchUrl(name);
		path += "&systemeid=";
		path += platformId;
		path += "&gameid=";
		path += id;

		mRequestQueue->push(std::unique_ptr<ScraperRequest>(new ScreenScraperRequest(results, path)));

		game = game.next_sibling("jeu");
		++queued;
	}
}
// Builds the "jeuInfos.php" query URL for the given game name.
// The developer ID and password are run through Utils::String::scramble with API_DEV_KEY before being
// placed in the URL; the software name and the game name are URL-encoded.
std::string ScreenScraperRequest::ScreenScraperConfig::getGameSearchUrl(const std::string gameName) const
{
	std::string url = API_URL_BASE;

	url += "/jeuInfos.php?devid=";
	url += Utils::String::scramble(API_DEV_U, API_DEV_KEY);

	url += "&devpassword=";
	url += Utils::String::scramble(API_DEV_P, API_DEV_KEY);

	url += "&softname=";
	url += HttpReq::urlEncode(API_SOFT_NAME);

	url += "&output=xml";

	url += "&romnom=";
	url += HttpReq::urlEncode(gameName);

	return url;
}

View file

@ -0,0 +1,73 @@
#pragma once
#ifndef ES_APP_SCRAPERS_SCREEN_SCRAPER_H
#define ES_APP_SCRAPERS_SCREEN_SCRAPER_H
#include "scrapers/Scraper.h"
#include "EmulationStation.h"
namespace pugi { class xml_document; }
void screenscraper_generate_scraper_requests(const ScraperSearchParams& params, std::queue< std::unique_ptr<ScraperRequest> >& requests,
std::vector<ScraperSearchResult>& results);
// HTTP scraper request against the ScreenScraper.fr API.
// A request either knows the shared request queue (so it can enqueue follow-up requests) or not;
// isGameRequest() tells the two apart.
class ScreenScraperRequest : public ScraperHttpRequest
{
public:
// ctor for a GetGameList request: keeps a pointer to the request queue it was given
ScreenScraperRequest(std::queue< std::unique_ptr<ScraperRequest> >& requestsWrite, std::vector<ScraperSearchResult>& resultsWrite, const std::string& url) : ScraperHttpRequest(resultsWrite, url), mRequestQueue(&requestsWrite) {}
// ctor for a GetGame request: no request queue (mRequestQueue is nullptr)
ScreenScraperRequest(std::vector<ScraperSearchResult>& resultsWrite, const std::string& url) : ScraperHttpRequest(resultsWrite, url), mRequestQueue(nullptr) {}
// Settings for the scraper
// NOTE(review): 'configuration' below is a static data member declared here; no out-of-class
// definition is visible alongside it, and the implementation file creates local ScreenScraperConfig
// objects instead of using it - confirm whether the static member is needed at all.
static const struct ScreenScraperConfig {
// Returns the jeuInfos.php search URL for the given game name
std::string getGameSearchUrl(const std::string gameName) const;
// Access to the API
// The developer ID and password are not stored in clear text: getGameSearchUrl() passes each of them
// through Utils::String::scramble() together with API_DEV_KEY before putting them in the URL.
const std::string API_DEV_U = { 91, 32, 7, 17 };
const std::string API_DEV_P = { 108, 28, 54, 55, 83, 43, 91, 44, 30, 22, 41, 12, 0, 108, 38, 29 };
const std::string API_DEV_KEY = { 54, 73, 115, 100, 101, 67, 111, 107, 79, 66, 68, 66, 67, 56, 118, 77, 54, 88, 101, 54 };
const std::string API_URL_BASE = "https://screenscraper.fr/api2";
// Sent (URL-encoded) as the "softname" parameter of every request
const std::string API_SOFT_NAME = "Emulationstation " + static_cast<std::string>(PROGRAM_VERSION_STRING);
/** Which type of image artwork we need. Possible values (not a comprehensive list):
- ss: in-game screenshot
- box-3D: 3D boxart
- box-2D: 2D boxart (default)
- screenmarque : marquee
- sstitle: in-game start screenshot
- steamgrid: Steam artwork
- wheel: spine
- support-2D: media showing the 2d boxart on the cart
- support-3D: media showing the 3d boxart on the cart
Note that not all games contain values for these, so we default to "box-2D" since it's the most common.
**/
std::string media_name = "box-2D";
// Which Region to use when selecting the artwork
// Applies to: artwork, name of the game, date of release
std::string region = "US";
// Which Language to use when selecting the textual information
// Applies to: description, genre
std::string language = "EN";
ScreenScraperConfig() {};
} configuration;
protected:
// Parses the HTTP response as XML and forwards the document to processGame()
void process(const std::unique_ptr<HttpReq>& req, std::vector<ScraperSearchResult>& results) override;
// Queues one follow-up game request per <jeu> entry of a list response (currently not called)
void processList(const pugi::xml_document& xmldoc, std::vector<ScraperSearchResult>& results);
// Turns the first <jeu> entry of a response into a ScraperSearchResult
void processGame(const pugi::xml_document& xmldoc, std::vector<ScraperSearchResult>& results);
// True when this request was created without a request queue
bool isGameRequest() { return !mRequestQueue; }
// Non-owning pointer to the shared request queue, or nullptr for a GetGame request
std::queue< std::unique_ptr<ScraperRequest> >* mRequestQueue;
};
#endif // ES_APP_SCRAPERS_SCREEN_SCRAPER_H

View file

@ -280,6 +280,20 @@ namespace Utils
} // format
// Simple XOR scrambling of a string, with an accompanying key.
// Each byte of _input is XOR-ed with the key byte at the same position; a key shorter than the input
// is repeated from its start, so the key is never read out of bounds. Applying the function twice
// with the same key gives back the original string.
// An empty key leaves the input unchanged.
// Returns a string of the same length as _input.
std::string scramble(const std::string& _input, const std::string& key)
{
	std::string buffer = _input;

	// Nothing to XOR with: hand the input back instead of indexing into an empty key.
	if (key.empty())
		return buffer;

	const size_t keyLength = key.size();

	for (size_t i = 0; i < buffer.size(); ++i)
	{
		// Wrap around the key so that inputs longer than the key stay within bounds.
		buffer[i] = _input[i] ^ key[i % keyLength];
	}

	return buffer;

} // scramble
} // String::
} // Utils::

View file

@ -25,7 +25,8 @@ namespace Utils
std::string removeParenthesis (const std::string& _string);
stringVector commaStringToVector(const std::string& _string);
std::string vectorToCommaString(stringVector _vector);
std::string format (const char* _string, ...);
std::string format (const char* _string, ...);
std::string scramble (const std::string& _input, const std::string& key);
} // String::