Différentes systèmes de gestion de base de données (SGBD) plus ou moins standardisé :
Base utilisée pour les exercices, pas de serveur à configurer embarqué dans un fichier sqlite.
Ensemble de n-uplets
Définition d’opération élémentaires permettant d’extraire les informations recherchées. Ces opérations prennent une (opérateurs unaires) ou deux relations (opérateurs binaires) et fournissent une relation en résultat.
SELECT * FROM table WHERE conditions
CREATE TABLE, ALTER TABLE, DROP TABLE
INSERT, UPDATE, DELETE
SELECT attr1 as a, attr2 as b
FROM table1 as t1, table2 as t2
WHERE t1.attr1=t2.attr2 AND t1.attr3> 5
Le résultat est l’ensemble de TOUS les n-uplets (pas de suppression des doublons) respectant la condition et contenant uniquement les attributs indiqués.
SELECT NomF, VilF, AdrF FROM Fournisseur
WHERE VilF LIKE 'Saint%' OR VilF LIKE '_aint%'
SELECT * FROM Fournisseur WHERE NomF BETWEEN 'A%' AND 'G%'
SELECT f.NomF, f.VilF, l.nomP FROM Fournisseur AS f, Livraison AS l
SELECT f.NomF, f.VilF, l.nomP
FROM Fournisseur AS f, Livraison AS l
WHERE f.NoF = l.NoF
SELECT f.NomF, f.VilF, l.nomP
FROM Fournisseur AS f, Livraison AS l
WHERE f.NoF = l.NoF and f.VilF LIKE 'P%'
SELECT COUNT(DISTINCT NomF), VilF FROM Fournisseur GROUP BY VilF
Comptage des fournisseurs pour les villes ayant + de 5 fournisseurs :
SELECT COUNT(DISTINCT NomF), VilF FROM Fournisseur
GROUP BY VilF HAVING COUNT(DISTINCT NomF)>5
SELECT NomF, VilF, AdrF
FROM Fournisseur
WHERE VilF LIKE 'Saint%'
ORDER BY NomF DESC LIMIT 10
SELECT MIN(strftime('%s',l.Date)), VilF, AdrF, NomF
FROM Livraison AS l, fournisseur AS f GROUP BY NomF
CREATE TABLE artist(
artistid INTEGER PRIMARY KEY,
artistname TEXT
);
CREATE UNIQUE INDEX artistname ON artist(artistname);
CREATE TABLE track(
trackid INTEGER,
trackname TEXT,
trackartist INTEGER,
FOREIGN KEY(trackartist) REFERENCES artist(artistid)
);
CREATE TABLE realisateur(
realid INTEGER PRIMARY KEY,
name TEXT,
adresse TEXT
};
CREATE TABLE producteur(
prodid INTEGER PRIMARY KEY,
name TEXT,
adresse TEXT
};
CREATE TABLE acteurs(
acteurid INTEGER PRIMARY KEY,
name TEXT,
adresse TEXT
};
CREATE TABLE film(
filmid INTEGER PRIMARY KEY,
budget INTEGER,
recette INTEGER,
realid INTEGER,
FOREIGN KEY(realid) REFERENCES realisateur(realid),
prodid INTEGER,
FOREIGN KEY(prodid) REFERENCES producteur(prodid)
);
CREATE TABLE role(
filmid INTEGER NOT NULL,
FOREIGN KEY(filmid) REFERENCES film(filmid),
acteurid INTEGER NOT NULL,
FOREIGN KEY(acteurid) REFERENCES acteur(acteurid),
PRIMARY KEY(filmid,acteurid)
);
/* mise à jour */
UPDATE artist SET artistname = 'unknown' WHERE artistname = ''
/* suppression */
DELETE FROM artist WHERE artistname = ''
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# importer a partir d'un fichier
routes = read.csv("./data/exo4.gtfs/routes.txt")
sqldf("select route_id, route_desc from routes limit 5")
## route_id route_desc
## 1 1 Majeure
## 2 2 Majeure
## 3 3 Majeure
## 4 4 Chronostar
## 5 5 Majeure
La structures des tables correpond à celle du format gtfs défini ici https://developers.google.com/transit/gtfs/
routes = read.csv("./data/exo4.gtfs/routes.txt")
agency = read.csv("./data/exo4.gtfs/agency.txt")
calendar = read.csv("./data/exo4.gtfs/calendar.txt")
stop_times = read.csv("./data/exo4.gtfs/stop_times.txt")
stops = read.csv("./data/exo4.gtfs/stops.txt")
trips = read.csv("./data/exo4.gtfs/trips.txt")
SELECT count(*) FROM trips
en R, en utilisant sqldf :
sqldf("select count(*) from trips")
## count(*)
## 1 29130
sqldf("select count(*) from routes")
## count(*)
## 1 104
sqldf("select count(*) from stops")
## count(*)
## 1 1442
SELECT * FROM stop_times LIMIT 5
en R, en utilisant sqldf :
sqldf("select trip_id,arrival_time from stop_times limit 5")
## trip_id arrival_time
## 1 1 05:23:00
## 2 1 05:25:00
## 3 1 05:26:00
## 4 1 05:26:00
## 5 1 05:27:00
SELECT COUNT(*) AS nbstops, * FROM stop_times GROUP BY trip_id
en R, en utilisant sqldf :
query="select count(*) as nbstops, trip_id from stop_times
group by trip_id limit 5"
sqldf(query)
## nbstops trip_id
## 1 30 1
## 2 30 2
## 3 30 3
## 4 30 4
## 5 30 5
SELECT COUNT(*) AS nbstops, *
FROM stop_times
GROUP BY trip_id
ORDER BY nbstops
DESC LIMIT 1
en R, en utilisant sqldf :
query="select count(*) as nbstops, * from stop_times
group by trip_id order by nbstops desc limit 1"
sqldf(query)
## nbstops trip_id arrival_time departure_time stop_id stop_sequence
## 1 42 25963 08:20:00 08:20:00 2846 40
## stop_headsign pickup_type drop_off_type shape_dist_traveled
## 1 NA 0 0 NA
SELECT COUNT(DISTINCT t.trip_id), t.route_id, r.route_long_name
FROM trips AS t, routes AS r
WHERE t.route_id=r.route_id
GROUP BY r.route_id
HAVING COUNT(DISTINCT t.trip_id)>500
en R, en utilisant sqldf :
query="select count(distinct t.trip_id), t.route_id, r.route_long_name
from trips as t, routes as r
where t.route_id=r.route_id
group by r.route_id
having count(distinct t.trip_id)>500 limit 5"
sqldf(query)
## count(distinct t.trip_id) route_id
## 1 1475 1
## 2 1215 2
## 3 1308 3
## 4 1932 4
## 5 1661 5
## route_long_name
## 1 Cesson-Sévigné (Champs Blancs) <> Chantepie (Rosa Parks)
## 2 Grand Quartier <> La Poterie / Haut Sancé
## 3 Saint-Laurent <> Henri Fréville
## 4 Beauregard <> ZA Saint-Sulpice
## 5 Patton <> Lycée Bréquigny
SELECT COUNT(*) AS nbtrips, stop_id
FROM stop_times
GROUP BY stop_id
ORDER BY nbtrips DESC
LIMIT 1
en R, en utilisant sqldf :
query="select count(*) as nbtrips, stop_id
from stop_times
group by stop_id
order by nbtrips desc
limit 1"
sqldf(query)
## nbtrips stop_id
## 1 3236 1027
SELECT COUNT(DISTINCT ro.route_id) AS nbroutes,
st.stop_id, tr.trip_id
FROM stop_times AS st,trips AS tr
WHERE st.trip_id=tr.trip_id
GROUP BY stop_id
en R, en utilisant sqldf :
query="select count(distinct tr.route_id) as nbroutes,
st.stop_id, tr.trip_id
from stop_times as st,trips as tr
where st.trip_id=tr.trip_id
group by stop_id limit 5"
sqldf(query)
## nbroutes stop_id trip_id
## 1 2 1001 5576
## 2 2 1002 5576
## 3 2 1003 5576
## 4 3 1004 999
## 5 3 1005 999
SELECT COUNT(DISTINCT tr.route_id) AS nbroutes,
st.stop_id, stn.stop_name
FROM stops AS stn, stop_times AS st,trips AS tr
WHERE st.stop_id=stn.stop_id AND st.trip_id=tr.trip_id
GROUP BY st.stop_id
ORDER BY nbroutes DESC LIMIT 5"
en R, en utilisant sqldf :
query="select count(distinct tr.route_id) as nbroutes,
st.stop_id, stn.stop_name
from stops as stn, stop_times as st,trips as tr
where st.stop_id=stn.stop_id and st.trip_id=tr.trip_id
group by st.stop_id
order by nbroutes desc limit 5"
sqldf(query)
## nbroutes stop_id stop_name
## 1 12 1610 Tournebride
## 2 10 1160 Tournebride
## 3 8 1007 Assomption
## 4 8 1011 Fac de Droit
## 5 8 1033 Assomption
SELECT st.stop_id, s.stop_name, MIN(st.departure_time)
FROM stop_times AS st, stops AS s
WHERE st.stop_id=s.stop_id
GROUP BY st.stop_id
en R, en utilisant sqldf :
query="select st.stop_id, s.stop_name, min(st.departure_time)
from stop_times as st, stops as s
where st.stop_id=s.stop_id
group by st.stop_id limit 5"
sqldf(query)
## stop_id stop_name min(st.departure_time)
## 1 1001 Longs Champs 04:25:00
## 2 1002 Bouzat 04:26:00
## 3 1003 Gallets 04:26:00
## 4 1004 Donzelot 04:26:00
## 5 1005 Mirabeau 04:27:00
SELECT DISTINCT st.stop_id, s.stop_name, st.departure_time
FROM stop_times AS st, stops AS s
WHERE st.departure_time BETWEEN '02:00:00' AND '05:00:00'
AND s.stop_id=st.stop_id
en R, en utilisant sqldf :
query="select distinct st.stop_id, s.stop_name, st.departure_time
from stop_times as st, stops as s
where st.departure_time between '02:00:00' and '05:00:00'
and s.stop_id=st.stop_id limit 5"
sqldf(query)
## stop_id stop_name departure_time
## 1 1051 Place de Bretagne 03:56:00
## 2 1052 République 03:55:00
## 3 1192 La Salle 04:03:00
## 4 1216 Le Mail 03:58:00
## 5 1217 Chèques Postaux 03:59:00
SELECT * FROM stops
WHERE wheelchair_boarding=2
en R, en utilisant sqldf :
query="select stop_id, stop_name from stops
where wheelchair_boarding=2 limit 5"
sqldf(query)
## stop_id stop_name
## 1 1015 Champ Jacquet
## 2 1017 Les Halles
## 3 1040 Donelière
## 4 1041 Trois Croix
## 5 1042 Cimetière Nord
SELECT trip_id, MIN(arrival_time), MAX(arrival_time),
MAX(STRFTIME('%s','2014-01-01 ' || arrival_time))
- MIN(STRFTIME('%s','2014-01-01 ' || arrival_time)) as time
FROM stop_times GROUP BY trip_id
en R, en utilisant sqldf :
query="select trip_id, min(arrival_time), max(arrival_time),
max(strftime('%s',arrival_time))
- min(strftime('%s', arrival_time)) as time
from stop_times group by trip_id
order by time desc limit 5"
sqldf(query)
## trip_id min(arrival_time) max(arrival_time) time
## 1 8265 17:09:00 18:18:00 4140
## 2 8266 17:09:00 18:18:00 4140
## 3 8267 17:09:00 18:18:00 4140
## 4 8268 17:09:00 18:18:00 4140
## 5 8269 17:09:00 18:18:00 4140
Vous pourrez utiliser un second traitement effectué en R
SELECT departure_time,
strftime('%s','2014-09-23' || arrival_time) AS depsec
FROM stop_times AS st, stops AS s, trips AS tr, calendar AS cal
WHERE s.stop_id=st.stop_id AND s.stop_name = 'Gayeulles Piscine'
AND tr.trip_id=st.trip_id AND tr.service_id=cal.service_id
AND cal.tuesday=1
AND cal.start_date<'20140923' AND cal.end_date>'20140923'
ORDER BY st.departure_time
en R, en utilisant sqldf + moyenne des différences :
query="select departure_time,
strftime('%s','2014-09-23' || arrival_time) as depsec
from stop_times as st, stops as s, trips as tr, calendar as cal
where s.stop_id=st.stop_id and s.stop_name = 'Gayeulles Piscine'
and tr.trip_id=st.trip_id and tr.service_id=cal.service_id
and cal.tuesday=1 and cal.start_date<'20140923'
and cal.end_date>'20140923' order by st.departure_time"
# stockage de la requête dans la data.frame horaires
horaires=sqldf(query)
# calcul de la fréquence moyenne
mean(diff(as.numeric(sort(horaires$depsec))))
## [1] 392.6966