7/19/2021

SQL

Mysql
/usr/local/mysql/bin/mysql -uroot -p
 
 
sql不分大小写!
 
SELECT * FROM people LIMIT 10;
SELECT DISTINCT language FROM films;
SELECT COUNT(*) FROM people;
SELECT COUNT(birthdate) FROM people;
SELECT COUNT(DISTINCT birthdate) FROM people;

SELECT title FROM films WHERE title = 'Metropolis'; must be single quot
SELECT * FROM films WHERE budget > 10000;
Use <> but not !=

SELECT title FROM films WHERE release_year > 1994 AND release_year < 2000;
SELECT title FROM films WHERE release_year > 1994 AND < 2000;
SELECT title FROM films WHERE (release_year = 1994 OR release_year = 1995) AND (certification = 'PG' OR certification = 'R');
SELECT title FROM films WHERE release_year BETWEEN 1994 AND 2000;
SELECT name FROM kids WHERE age IN (2, 4, 6, 8, 10);
SELECT name FROM people WHERE birthdate IS NOT NULL;
SELECT name FROM companies WHERE name LIKE 'Data%';  
SELECT name FROM companies WHERE name LIKE 'Data%';
SELECT AVG(budget) FROM films;
 
MAX () MIN() SUM() AVG(), but cant use after WHERE clause
 
SELECT (4.0 / 3.0) AS result;  1.333
SELECT (4 / 3); = 1
 
 Can perform + - * /

SELECT MAX(budget) AS max_budget,
       MAX(duration) AS max_duration
FROM films;
 
SELECT AVG(duration/60.0) AS avg_duration_hours
FROM films;  必须是60.0 不然不行

SELECT title FROM films
ORDER BY release_year, title DESC;

SELECT sex, count(*)
FROM employees
GROUP BY sex
ORDER BY count DESC;

SELECT release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;

SELECT title, imdb_score FROM films
JOIN reviews
ON films.id = reviews.film_id
WHERE title = 'To Kill a Mockingbird';

SELECT cities.name AS city, countries.name AS country
FROM cities
  INNER JOIN countries
    ON cities.country_code = countries.code;

SELECT c1.name AS city, c2.name AS country
FROM cities AS c1
INNER JOIN countries AS c2
ON c1.country_code = c2.code; 

SELECT *
FROM left_table
  INNER JOIN right_table
    ON left_table.id = right_table.id
  INNER JOIN another_table
    ON left_table.id = another_table.id;

SELECT *
FROM countries
  INNER JOIN economies
    USING(code)

SELECT name, continent, code, surface_area,
    CASE WHEN surface_area > 2000000
            THEN 'large'
       WHEN surface_area > 350000
            THEN 'medium'
       ELSE 'small' END
       AS geosize_group
INTO countries_plus
FROM countries;
Multiple line comment /* And */

Left join, right join, full join

SELECT NAME
  FROM cities
EXCEPT / intersect / Union/ Union all
SELECT capital
  FROM countries
ORDER BY name; 
Semi-join
SELECT DISTINCT name
FROM languages
WHERE code IN
  (SELECT code
   FROM countries
   WHERE region = 'Middle East')
ORDER BY name;

SELECT max(inflation_rate) as max_inf
  FROM (
      SELECT name, continent, inflation_rate
      FROM countries
      INNER JOIN economies
      USING (code)
  WHERE year = 2015) AS subquery
GROUP BY continent;

SELECT name, continent, inflation_rate
  FROM countries
INNER JOIN economies
USING (code)
  WHERE year = 2015
    and inflation_rate IN (
        SELECT MAX(inflation_rate) AS max_inf
        FROM (
             SELECT name, continent, inflation_rate
             FROM countries
             INNER JOIN economies
             USING (code)
             WHERE year = 2015) AS subquery
        GROUP BY continent);
 
SELECT DISTINCT nametotal_investmentimports
  FROM countries AS c
    LEFT JOIN economies AS e
      ON (c.code = e.code
        AND c.code IN (
          SELECT l.code
          FROM languages AS l
          WHERE official = 'true'
        ) )
  WHERE region = 'Central America' AND year = 2015
ORDER BY name;
 
 
SELECT regioncontinentAVG(fertility_rateAS avg_fert_rate
  FROM countries AS c
    INNER JOIN populations AS p
      ON c.code = p.country_code
  WHERE year = 2015
GROUP BY regioncontinent
ORDER BY avg_fert_rate;
 
SELECT cities.namecities.country_codecity_proper_popmetroarea_pop
      city_proper_pop / metroarea_pop * 100.0 AS city_perc
  FROM cities
  WHERE name IN
    (SELECT capital
     FROM countries
     WHERE continent = 'Europe'
        OR continent LIKE '%America')
        AND metroarea_pop IS NOT NULL
ORDER BY city_perc DESC
LIMIT 10
 
SELECT * FROM table WHERE CASE WHEN a > 5 THEN 'Keep'
WHEN a <= 5 THEN 'Exclude' END = 'Keep'; 
 
SELECT season, date, home_goal, away_goal FROM matches_italy WHERE -- Exclude games not won by Bologna CASE WHEN hometeam_id = 9857 and home_goal > away_goal THEN 'Bologna Win' WHEN awayteam_id = 9857 AND away_goal > home_goal THEN 'Bologna Win'
END IS NOT NULL; 

SELECT c.name AS country, -- Count games from the 2012/2013 season COUNT (CASE WHEN m.season = '2012/2013' THEN m.id ELSE NULL END) AS matches_2012_2013 FROM country AS c LEFT JOIN match AS m ON c.id = m.country_id -- Group by country name alias GROUP BY country;

-- percentage of game tied
SELECT c.name AS country, -- Round the percentage of tied games to 2 decimal points ROUND(AVG(CASE WHEN m.season='2013/2014' AND m.home_goal = m.away_goal THEN 1 WHEN m.season='2013/2014' AND m.home_goal != m.away_goal THEN 0 END),2) AS pct_ties_2013_2014, ROUND(AVG(CASE WHEN m.season='2014/2015' AND m.home_goal = m.away_goal THEN 1 WHEN m.season='2014/2015' AND m.home_goal != m.away_goal THEN 0 END),2) AS pct_ties_2014_2015 FROM country AS c LEFT JOIN matches AS m ON c.id = m.country_id GROUP BY country;


Windows:

SELECT m.away_goal, AVG(m.home_goal + m.away_goal) over() AS overall_avg FROM match AS m

SELECT id, RANK() OVER(ORDER BY home_goal) AS rank FROM match;

rank() over(ORDER BY AVG(m.home_goal + m.away_goal)) AS league_rank ORDER BY league_rank;

SELECT id, RANK() OVER(ORDER BY home_goal DESC) AS rank FROM match;

AVG(home_goal) OVER(PARTITION BY season, EXTRACT(month FROM date)) AS season_mo_home,


Sliding to the left:

SELECT date, home_goal, away_goal, -- Create a running total and running average of home goals SUM(home_goal) over(ORDER BY DATE rows between unbounded preceding and current row) AS running_total, AVG(home_goal) OVER(ORDER BY date rows between unbounded preceding and current row) AS running_avg FROM match WHERE hometeam_id = 9908 AND season = '2011/2012';

Slide to the right:

-- Create a running total and running average of home goals SUM(home_goal) over(ORDER BY DATE DESC ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_total, AVG(home_goal) OVER(ORDER BY DATE DESC ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_avg



-- Set up the home team CTE with home AS( SELECT m.id, t.team_long_name, CASE WHEN m.home_goal > m.away_goal THEN 'MU Win' WHEN m.home_goal < m.away_goal THEN 'MU Loss' ELSE 'Tie' END AS outcome FROM match AS m LEFT JOIN team AS t ON m.hometeam_id = t.team_api_id), -- Set up the away team CTE away AS( SELECT m.id, t.team_long_name, CASE WHEN m.home_goal > m.away_goal THEN 'MU Loss' WHEN m.home_goal < m.away_goal THEN 'MU Win' ELSE 'Tie' END AS outcome FROM match AS m LEFT JOIN team AS t ON m.awayteam_id = t.team_api_id) -- Select columns and and rank the matches by date SELECT DISTINCT m.DATE, home.team_long_name AS home_team, away.team_long_name AS away_team, m.home_goal, m.away_goal, rank() over(ORDER BY ABS(home_goal - away_goal) DESC) as match_rank -- Join the CTEs onto the match table FROM match AS m LEFT JOIN home ON m.id = home.id LEFT JOIN away ON m.id = away.id WHERE m.season = '2014/2015' AND ((home.team_long_name = 'Manchester United' AND home.outcome = 'MU Loss') OR (away.team_long_name = 'Manchester United' AND away.outcome = 'MU Loss'));



7/15/2021

awk print the first 4 and last 4 letters from a string

 cat tmp | awk '{print $1","substr($1,1,4)"_"substr($1,length($1)-3,length($1))}' > input_list_combine.table

7/14/2021

Data cleaning

 


Df[sth].dtype()
Df[sth].astype(‘category’)
Assert 1+1 == 2





























$ from fuzzywuzzy import fuzz

$ fuzz.WRaio("Reeding", "Reading")






Matches.index shows column names



7/10/2021

Import

$ data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None)

$ np.recfromcsv(‘file_name’, nrows= 5, header = None)
$ pd.read_csv()
$ pd.read_table()

data = pd.read_csv(file, sep='\t', comment='#', na_values='Nothing')
 

Magics: 
!ls 
import os
wd = os.getcwd()
os.listdir(wd)

df1 = xls.parse(0, skiprows= 1, names=['Country', 'AAM due to War (2002)'])

df2 = xls.parse(0, usecols= 0, skiprows=1, names=['z', 'A'])


 

# Get the HDF5 group: group
group = data['strain']
 
# Check out keys of group
for key in group.keys():
    print(key)

# Set variable equal to time series data: strain
strain = data['strain']['Strain'].value

# Set number of time points to sample: num_samples
num_samples = 10000

# Set time vector
time = np.arange(0, 1, 1/num_samples)

# Plot data
plt.plot(time, strain[:num_samples])
plt.xlabel('GPS Time (s)')
plt.ylabel('strain')
plt.show()



# Print the keys of the MATLAB dictionary
print(mat.keys())

# Print the type of the value corresponding to the key 'CYratioCyt'
print(type(mat['CYratioCyt']))

# Print the shape of the value corresponding to the key 'CYratioCyt'
print(mat['CYratioCyt'].shape)

# Subset the array and plot it
data = mat['CYratioCyt'][25, 5:]
fig = plt.figure()
plt.plot(data)
plt.xlabel('time (min.)')
plt.ylabel('normalized fluorescence (measure of expression)')
plt.show()



# Import necessary module
from sqlalchemy import create_engine

# Create engine: engine
engine = create_engine('sqlite:///Chinook.SQLite')

# Import necessary module
from sqlalchemy import create_engine

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Save the table names to a list: table_names
table_names = engine.table_names()

# Print the table names to the shell
print(table_names)





# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine connection: con
con = engine.connect()

# Perform query: rs
rs = con.execute("SELECT * FROM album")

# Save results of the query to DataFrame: df
df = pd.DataFrame(rs.fetchall())

# Close connection

con.close()
# Print head of DataFrame df
print(df.head())

# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute("SELECT LastName, Title FROM Employee")
    df = pd.DataFrame(rs.fetchmany(size=3))
    df.columns = rs.keys()

# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute("SELECT * FROM Employee WHERE EmployeeId >= 6")
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()

with engine.connect() as con:
    rs = con.execute("SELECT * FROM Employee ORDER BY BirthDate")
    df = pd.DataFrame(rs.fetchall())



# Execute query and store records in DataFrame: df
df = pd.read_sql_query("SELECT * FROM album", engine)

df = pd.read_sql_query("SELECT * FROM Employee WHERE EmployeeId >= 6 ORDER BY BirthDate", engine)



with engine.connect() as con:
    rs=con.execute("SELECT Title, Name FROM Album INNER JOIN Artist on Album.ArtistID = Artist.ArtistID")
    df=pd.DataFrame(rs.fetchall())
    df.columns=rs.keys()

df = pd.read_sql_query("SELECT * FROM PlaylistTrack INNER JOIN Track on PlaylistTrack.TrackId = Track.TrackId WHERE Milliseconds < 250000", engine)

7/06/2021

Manage data

Df[A & B]

Df[ A | B]

Df[ ~B] : 不是boolean B的subset


Pmf(df.col, normalize=False)


Cdf(df[age])
Q = cdf_income.inverse(0.75) : 75th percentile income






Df_subset.Corr()